From 8dc26dbe93d10af108e0295c86f83b511bb13cac Mon Sep 17 00:00:00 2001
From: alemuntoni
Date: Tue, 15 Jun 2021 12:27:04 +0200
Subject: [PATCH] update eigen to 3.3.9

---
 eigenlib/Eigen/Cholesky | 5 +
 eigenlib/Eigen/Core | 40 +-
 eigenlib/Eigen/Eigenvalues | 8 +-
 eigenlib/Eigen/Geometry | 4 +-
 eigenlib/Eigen/LU | 4 +
 eigenlib/Eigen/QR | 8 +-
 eigenlib/Eigen/QtAlignedMalloc | 6 +-
 eigenlib/Eigen/SVD | 4 +
 eigenlib/Eigen/Sparse | 2 +
 eigenlib/Eigen/SparseQR | 1 -
 eigenlib/Eigen/StdDeque | 2 +-
 eigenlib/Eigen/StdList | 2 +-
 eigenlib/Eigen/StdVector | 2 +-
 eigenlib/Eigen/src/Cholesky/LDLT.h | 14 +-
 eigenlib/Eigen/src/Cholesky/LLT.h | 26 +-
 eigenlib/Eigen/src/Core/Array.h | 10 +-
 eigenlib/Eigen/src/Core/ArrayBase.h | 12 +-
 eigenlib/Eigen/src/Core/ArrayWrapper.h | 6 +-
 eigenlib/Eigen/src/Core/AssignEvaluator.h | 33 +-
 eigenlib/Eigen/src/Core/Assign_MKL.h | 6 +-
 eigenlib/Eigen/src/Core/ConditionEstimator.h | 2 +-
 eigenlib/Eigen/src/Core/CoreEvaluators.h | 43 +-
 eigenlib/Eigen/src/Core/CwiseNullaryOp.h | 80 +-
 eigenlib/Eigen/src/Core/CwiseUnaryView.h | 2 +
 eigenlib/Eigen/src/Core/DenseBase.h | 27 +-
 eigenlib/Eigen/src/Core/DenseStorage.h | 35 +-
 eigenlib/Eigen/src/Core/Diagonal.h | 7 +-
 eigenlib/Eigen/src/Core/Dot.h | 17 +-
 eigenlib/Eigen/src/Core/EigenBase.h | 4 +
 eigenlib/Eigen/src/Core/GeneralProduct.h | 21 +-
 eigenlib/Eigen/src/Core/GenericPacketMath.h | 15 +-
 eigenlib/Eigen/src/Core/IO.h | 14 -
 eigenlib/Eigen/src/Core/Map.h | 17 +-
 eigenlib/Eigen/src/Core/MapBase.h | 11 +-
 eigenlib/Eigen/src/Core/MathFunctions.h | 95 +--
 eigenlib/Eigen/src/Core/MathFunctionsImpl.h | 23 +
 eigenlib/Eigen/src/Core/Matrix.h | 2 -
 eigenlib/Eigen/src/Core/MatrixBase.h | 42 +-
 eigenlib/Eigen/src/Core/NumTraits.h | 2 +
 eigenlib/Eigen/src/Core/PermutationMatrix.h | 28 -
 eigenlib/Eigen/src/Core/PlainObjectBase.h | 28 +-
 eigenlib/Eigen/src/Core/Product.h | 10 +-
 eigenlib/Eigen/src/Core/ProductEvaluators.h | 65 +-
 eigenlib/Eigen/src/Core/Redux.h | 2 +-
 eigenlib/Eigen/src/Core/Ref.h | 7 +-
 eigenlib/Eigen/src/Core/SelfAdjointView.h | 6 +-
 eigenlib/Eigen/src/Core/SelfCwiseBinaryOp.h | 12 +-
 eigenlib/Eigen/src/Core/Solve.h | 4 +-
 eigenlib/Eigen/src/Core/SolveTriangular.h | 9 +-
 eigenlib/Eigen/src/Core/StableNorm.h | 5 +-
 eigenlib/Eigen/src/Core/Transpose.h | 2 +
 eigenlib/Eigen/src/Core/Transpositions.h | 41 +-
 eigenlib/Eigen/src/Core/TriangularMatrix.h | 8 +-
 eigenlib/Eigen/src/Core/arch/AVX/Complex.h | 36 +-
 eigenlib/Eigen/src/Core/arch/AVX/PacketMath.h | 24 +-
 .../src/Core/arch/AVX512/MathFunctions.h | 95 ++-
 .../Eigen/src/Core/arch/AVX512/PacketMath.h | 731 +++++++++---------
 .../Eigen/src/Core/arch/AltiVec/Complex.h | 37 +-
 .../Eigen/src/Core/arch/AltiVec/PacketMath.h | 58 +-
 eigenlib/Eigen/src/Core/arch/CUDA/Half.h | 220 ++++--
 .../Eigen/src/Core/arch/CUDA/PacketMath.h | 2 +-
 .../Eigen/src/Core/arch/CUDA/PacketMathHalf.h | 9 +-
 .../Eigen/src/Core/arch/Default/ConjHelper.h | 29 +
 eigenlib/Eigen/src/Core/arch/NEON/Complex.h | 12 +-
 .../Eigen/src/Core/arch/NEON/PacketMath.h | 98 ++-
 eigenlib/Eigen/src/Core/arch/SSE/Complex.h | 40 +-
 eigenlib/Eigen/src/Core/arch/SSE/PacketMath.h | 24 +-
 .../Eigen/src/Core/arch/SSE/TypeCasting.h | 28 +-
 .../Eigen/src/Core/arch/ZVector/Complex.h | 3 +
 .../Eigen/src/Core/arch/ZVector/PacketMath.h | 2 +-
 .../src/Core/functors/AssignmentFunctors.h | 2 +-
 .../Eigen/src/Core/functors/BinaryFunctors.h | 25 +-
 .../Eigen/src/Core/functors/NullaryFunctors.h | 15 +-
 .../Eigen/src/Core/functors/StlFunctors.h | 4 +
 .../Eigen/src/Core/functors/UnaryFunctors.h | 2 +-
 .../Core/products/GeneralBlockPanelKernel.h | 22 +-
 .../src/Core/products/GeneralMatrixMatrix.h | 45 +-
 .../products/GeneralMatrixMatrixTriangular.h | 79 +-
 .../GeneralMatrixMatrixTriangular_BLAS.h | 20 +-
 .../Core/products/GeneralMatrixMatrix_BLAS.h | 25 +-
 .../src/Core/products/GeneralMatrixVector.h | 8 +-
 .../Core/products/GeneralMatrixVector_BLAS.h | 19 +-
 .../Eigen/src/Core/products/Parallelizer.h | 18 +-
 .../Core/products/SelfadjointMatrixMatrix.h | 54 +-
 .../products/SelfadjointMatrixMatrix_BLAS.h | 72 +-
 .../Core/products/SelfadjointMatrixVector.h | 14 +-
 .../products/SelfadjointMatrixVector_BLAS.h | 9 +-
 .../src/Core/products/SelfadjointProduct.h | 4 +-
 .../Core/products/TriangularMatrixMatrix.h | 89 ++-
 .../products/TriangularMatrixMatrix_BLAS.h | 65 +-
 .../Core/products/TriangularMatrixVector.h | 22 +-
 .../products/TriangularMatrixVector_BLAS.h | 46 +-
 .../Core/products/TriangularSolverMatrix.h | 62 +-
 .../products/TriangularSolverMatrix_BLAS.h | 52 +-
 eigenlib/Eigen/src/Core/util/BlasUtil.h | 115 ++-
 .../src/Core/util/DisableStupidWarnings.h | 27 +-
 .../Eigen/src/Core/util/ForwardDeclarations.h | 6 +-
 eigenlib/Eigen/src/Core/util/MKL_support.h | 10 +-
 eigenlib/Eigen/src/Core/util/Macros.h | 79 +-
 eigenlib/Eigen/src/Core/util/Memory.h | 78 +-
 eigenlib/Eigen/src/Core/util/Meta.h | 76 ++
 .../src/Core/util/ReenableStupidWarnings.h | 8 +-
 eigenlib/Eigen/src/Core/util/StaticAssert.h | 120 +--
 eigenlib/Eigen/src/Core/util/XprHelper.h | 19 +-
 .../src/Eigenvalues/ComplexEigenSolver.h | 6 +-
 eigenlib/Eigen/src/Eigenvalues/ComplexSchur.h | 9 +-
 .../src/Eigenvalues/GeneralizedEigenSolver.h | 5 +-
 .../src/Eigenvalues/MatrixBaseEigenvalues.h | 2 -
 eigenlib/Eigen/src/Eigenvalues/RealSchur.h | 31 +-
 .../src/Eigenvalues/SelfAdjointEigenSolver.h | 10 +-
 .../SelfAdjointEigenSolver_LAPACKE.h | 23 +-
 eigenlib/Eigen/src/Geometry/AngleAxis.h | 2 +-
 eigenlib/Eigen/src/Geometry/Quaternion.h | 61 +-
 eigenlib/Eigen/src/Geometry/Scaling.h | 2 +-
 eigenlib/Eigen/src/Geometry/Transform.h | 4 +-
 eigenlib/Eigen/src/Geometry/Translation.h | 6 -
 eigenlib/Eigen/src/Geometry/Umeyama.h | 2 +-
 .../Eigen/src/Geometry/arch/Geometry_SSE.h | 60 +-
 .../Eigen/src/Householder/BlockHouseholder.h | 3 +-
 .../BasicPreconditioners.h | 27 +-
 .../ConjugateGradient.h | 5 +-
 eigenlib/Eigen/src/Jacobi/Jacobi.h | 261 ++++---
 eigenlib/Eigen/src/LU/InverseImpl.h | 2 +-
 eigenlib/Eigen/src/LU/PartialPivLU.h | 5 +-
 eigenlib/Eigen/src/LU/arch/Inverse_SSE.h | 4 +-
 .../Eigen/src/OrderingMethods/Eigen_Colamd.h | 2 +-
 .../Eigen/src/PaStiXSupport/PaStiXSupport.h | 8 +-
 .../Eigen/src/PardisoSupport/PardisoSupport.h | 3 +-
 eigenlib/Eigen/src/QR/ColPivHouseholderQR.h | 12 +-
 eigenlib/Eigen/src/SVD/BDCSVD.h | 171 ++--
 eigenlib/Eigen/src/SVD/JacobiSVD.h | 12 +-
 eigenlib/Eigen/src/SVD/JacobiSVD_LAPACKE.h | 5 +-
 eigenlib/Eigen/src/SVD/SVDBase.h | 4 +-
 .../Eigen/src/SVD/UpperBidiagonalization.h | 4 +-
 .../src/SparseCholesky/SimplicialCholesky.h | 2 +-
 .../SparseCholesky/SimplicialCholesky_impl.h | 2 +-
 eigenlib/Eigen/src/SparseCore/AmbiVector.h | 15 +-
 .../ConservativeSparseSparseProduct.h | 67 +-
 eigenlib/Eigen/src/SparseCore/SparseAssign.h | 5 +-
 .../src/SparseCore/SparseCompressedBase.h | 38 +-
 .../src/SparseCore/SparseCwiseBinaryOp.h | 10 +
 .../Eigen/src/SparseCore/SparseCwiseUnaryOp.h | 2 +
 .../src/SparseCore/SparseDiagonalProduct.h | 4 +
 eigenlib/Eigen/src/SparseCore/SparseMatrix.h | 17 +-
 .../src/SparseCore/SparseSelfAdjointView.h | 15 +-
 .../SparseSparseProductWithPruning.h | 22 +-
 eigenlib/Eigen/src/SparseCore/SparseView.h | 1 +
 eigenlib/Eigen/src/SparseLU/SparseLU.h | 6 +-
 eigenlib/Eigen/src/SparseQR/SparseQR.h | 26 +-
 eigenlib/Eigen/src/StlSupport/StdDeque.h | 6 +-
 eigenlib/Eigen/src/StlSupport/details.h | 14 +-
 .../Eigen/src/SuperLUSupport/SuperLUSupport.h | 4 +-
 .../Eigen/src/UmfPackSupport/UmfPackSupport.h | 101 ++-
 .../Eigen/src/plugins/ArrayCwiseBinaryOps.h | 2 +-
 eigenlib/howto.txt | 3 +-
 155 files changed, 2860 insertions(+), 1893 deletions(-)
 create mode 100644 eigenlib/Eigen/src/Core/arch/Default/ConjHelper.h
 mode change 100755 => 100644 eigenlib/Eigen/src/Geometry/Scaling.h

diff --git a/eigenlib/Eigen/Cholesky b/eigenlib/Eigen/Cholesky
index 369d1f5e..1332b540 100644
--- a/eigenlib/Eigen/Cholesky
+++ b/eigenlib/Eigen/Cholesky
@@ -9,6 +9,7 @@
 #define EIGEN_CHOLESKY_MODULE_H
 
 #include "Core"
+#include "Jacobi"
 
 #include "src/Core/util/DisableStupidWarnings.h"
 
@@ -31,7 +32,11 @@
 #include "src/Cholesky/LLT.h"
 #include "src/Cholesky/LDLT.h"
 #ifdef EIGEN_USE_LAPACKE
+#ifdef EIGEN_USE_MKL
+#include "mkl_lapacke.h"
+#else
 #include "src/misc/lapacke.h"
+#endif
 #include "src/Cholesky/LLT_LAPACKE.h"
 #endif
diff --git a/eigenlib/Eigen/Core b/eigenlib/Eigen/Core
index 82558155..ac7c5b30 100644
--- a/eigenlib/Eigen/Core
+++ b/eigenlib/Eigen/Core
@@ -14,6 +14,22 @@
 // first thing Eigen does: stop the compiler from committing suicide
 #include "src/Core/util/DisableStupidWarnings.h"
 
+#if defined(__CUDACC__) && !defined(EIGEN_NO_CUDA)
+  #define EIGEN_CUDACC __CUDACC__
+#endif
+
+#if defined(__CUDA_ARCH__) && !defined(EIGEN_NO_CUDA)
+  #define EIGEN_CUDA_ARCH __CUDA_ARCH__
+#endif
+
+#if defined(__CUDACC_VER_MAJOR__) && (__CUDACC_VER_MAJOR__ >= 9)
+#define EIGEN_CUDACC_VER ((__CUDACC_VER_MAJOR__ * 10000) + (__CUDACC_VER_MINOR__ * 100))
+#elif defined(__CUDACC_VER__)
+#define EIGEN_CUDACC_VER __CUDACC_VER__
+#else
+#define EIGEN_CUDACC_VER 0
+#endif
+
 // Handle NVCC/CUDA/SYCL
 #if defined(__CUDACC__) || defined(__SYCL_DEVICE_ONLY__)
   // Do not try asserts on CUDA and SYCL!
@@ -37,9 +53,9 @@
   #endif
 
   #define EIGEN_DEVICE_FUNC __host__ __device__
-  // We need math_functions.hpp to ensure that that EIGEN_USING_STD_MATH macro
+  // We need cuda_runtime.h to ensure that that EIGEN_USING_STD_MATH macro
   // works properly on the device side
-  #include <math_functions.hpp>
+  #include <cuda_runtime.h>
 #else
   #define EIGEN_DEVICE_FUNC
 #endif
@@ -155,6 +171,9 @@
   #ifdef __AVX512DQ__
     #define EIGEN_VECTORIZE_AVX512DQ
   #endif
+  #ifdef __AVX512ER__
+    #define EIGEN_VECTORIZE_AVX512ER
+  #endif
 #endif
 
 // include files
@@ -229,7 +248,7 @@
 #if defined __CUDACC__
   #define EIGEN_VECTORIZE_CUDA
   #include
-  #if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+  #if EIGEN_CUDACC_VER >= 70500
     #define EIGEN_HAS_CUDA_FP16
   #endif
 #endif
@@ -260,7 +279,10 @@
 #include
 #include
 #include
-#include
+#include
+#ifndef EIGEN_NO_IO
+  #include
+#endif
 #include
 #include
 #include
@@ -321,12 +343,16 @@ inline static const char *SimdInstructionSetsInUse(void) {
 #error Eigen2-support is only available up to version 3.2.
Please go to "http://eigen.tuxfamily.org/index.php?title=Eigen2" for further information #endif +namespace Eigen { + // we use size_t frequently and we'll never remember to prepend it with std:: everytime just to // ensure QNX/QCC support using std::size_t; // gcc 4.6.0 wants std:: for ptrdiff_t using std::ptrdiff_t; +} + /** \defgroup Core_Module Core module * This is the main module of Eigen providing dense matrix and vector support * (both fixed and dynamic size) with all the features corresponding to a BLAS library @@ -348,10 +374,13 @@ using std::ptrdiff_t; #include "src/Core/MathFunctions.h" #include "src/Core/GenericPacketMath.h" #include "src/Core/MathFunctionsImpl.h" +#include "src/Core/arch/Default/ConjHelper.h" #if defined EIGEN_VECTORIZE_AVX512 #include "src/Core/arch/SSE/PacketMath.h" + #include "src/Core/arch/SSE/MathFunctions.h" #include "src/Core/arch/AVX/PacketMath.h" + #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX512/PacketMath.h" #include "src/Core/arch/AVX512/MathFunctions.h" #elif defined EIGEN_VECTORIZE_AVX @@ -363,6 +392,7 @@ using std::ptrdiff_t; #include "src/Core/arch/AVX/MathFunctions.h" #include "src/Core/arch/AVX/Complex.h" #include "src/Core/arch/AVX/TypeCasting.h" + #include "src/Core/arch/SSE/TypeCasting.h" #elif defined EIGEN_VECTORIZE_SSE #include "src/Core/arch/SSE/PacketMath.h" #include "src/Core/arch/SSE/MathFunctions.h" @@ -405,6 +435,7 @@ using std::ptrdiff_t; // on CUDA devices #include "src/Core/arch/CUDA/Complex.h" +#include "src/Core/IO.h" #include "src/Core/DenseCoeffsBase.h" #include "src/Core/DenseBase.h" #include "src/Core/MatrixBase.h" @@ -452,7 +483,6 @@ using std::ptrdiff_t; #include "src/Core/Redux.h" #include "src/Core/Visitor.h" #include "src/Core/Fuzzy.h" -#include "src/Core/IO.h" #include "src/Core/Swap.h" #include "src/Core/CommaInitializer.h" #include "src/Core/GeneralProduct.h" diff --git a/eigenlib/Eigen/Eigenvalues b/eigenlib/Eigen/Eigenvalues index 009e529e..7d6ac787 100644 --- a/eigenlib/Eigen/Eigenvalues +++ b/eigenlib/Eigen/Eigenvalues @@ -10,14 +10,14 @@ #include "Core" -#include "src/Core/util/DisableStupidWarnings.h" - #include "Cholesky" #include "Jacobi" #include "Householder" #include "LU" #include "Geometry" +#include "src/Core/util/DisableStupidWarnings.h" + /** \defgroup Eigenvalues_Module Eigenvalues module * * @@ -45,7 +45,11 @@ #include "src/Eigenvalues/GeneralizedEigenSolver.h" #include "src/Eigenvalues/MatrixBaseEigenvalues.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/Eigenvalues/RealSchur_LAPACKE.h" #include "src/Eigenvalues/ComplexSchur_LAPACKE.h" #include "src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h" diff --git a/eigenlib/Eigen/Geometry b/eigenlib/Eigen/Geometry index 716d5295..da88c03b 100644 --- a/eigenlib/Eigen/Geometry +++ b/eigenlib/Eigen/Geometry @@ -10,12 +10,12 @@ #include "Core" -#include "src/Core/util/DisableStupidWarnings.h" - #include "SVD" #include "LU" #include +#include "src/Core/util/DisableStupidWarnings.h" + /** \defgroup Geometry_Module Geometry module * * This module provides support for: diff --git a/eigenlib/Eigen/LU b/eigenlib/Eigen/LU index 6f6c5562..6418a86e 100644 --- a/eigenlib/Eigen/LU +++ b/eigenlib/Eigen/LU @@ -28,7 +28,11 @@ #include "src/LU/FullPivLU.h" #include "src/LU/PartialPivLU.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/LU/PartialPivLU_LAPACKE.h" #endif #include 
"src/LU/Determinant.h" diff --git a/eigenlib/Eigen/QR b/eigenlib/Eigen/QR index 80838e3b..1be1863a 100644 --- a/eigenlib/Eigen/QR +++ b/eigenlib/Eigen/QR @@ -10,12 +10,12 @@ #include "Core" -#include "src/Core/util/DisableStupidWarnings.h" - #include "Cholesky" #include "Jacobi" #include "Householder" +#include "src/Core/util/DisableStupidWarnings.h" + /** \defgroup QR_Module QR module * * @@ -36,7 +36,11 @@ #include "src/QR/ColPivHouseholderQR.h" #include "src/QR/CompleteOrthogonalDecomposition.h" #ifdef EIGEN_USE_LAPACKE +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/QR/HouseholderQR_LAPACKE.h" #include "src/QR/ColPivHouseholderQR_LAPACKE.h" #endif diff --git a/eigenlib/Eigen/QtAlignedMalloc b/eigenlib/Eigen/QtAlignedMalloc index 4044d5ac..4f07df02 100644 --- a/eigenlib/Eigen/QtAlignedMalloc +++ b/eigenlib/Eigen/QtAlignedMalloc @@ -14,7 +14,7 @@ #include "src/Core/util/DisableStupidWarnings.h" -void *qMalloc(size_t size) +void *qMalloc(std::size_t size) { return Eigen::internal::aligned_malloc(size); } @@ -24,10 +24,10 @@ void qFree(void *ptr) Eigen::internal::aligned_free(ptr); } -void *qRealloc(void *ptr, size_t size) +void *qRealloc(void *ptr, std::size_t size) { void* newPtr = Eigen::internal::aligned_malloc(size); - memcpy(newPtr, ptr, size); + std::memcpy(newPtr, ptr, size); Eigen::internal::aligned_free(ptr); return newPtr; } diff --git a/eigenlib/Eigen/SVD b/eigenlib/Eigen/SVD index 86143c23..5d0e75f7 100644 --- a/eigenlib/Eigen/SVD +++ b/eigenlib/Eigen/SVD @@ -37,7 +37,11 @@ #include "src/SVD/JacobiSVD.h" #include "src/SVD/BDCSVD.h" #if defined(EIGEN_USE_LAPACKE) && !defined(EIGEN_USE_LAPACKE_STRICT) +#ifdef EIGEN_USE_MKL +#include "mkl_lapacke.h" +#else #include "src/misc/lapacke.h" +#endif #include "src/SVD/JacobiSVD_LAPACKE.h" #endif diff --git a/eigenlib/Eigen/Sparse b/eigenlib/Eigen/Sparse index a2ef7a66..136e681a 100644 --- a/eigenlib/Eigen/Sparse +++ b/eigenlib/Eigen/Sparse @@ -25,7 +25,9 @@ #include "SparseCore" #include "OrderingMethods" +#ifndef EIGEN_MPL2_ONLY #include "SparseCholesky" +#endif #include "SparseLU" #include "SparseQR" #include "IterativeLinearSolvers" diff --git a/eigenlib/Eigen/SparseQR b/eigenlib/Eigen/SparseQR index a6f3b7f7..f5fc5fa7 100644 --- a/eigenlib/Eigen/SparseQR +++ b/eigenlib/Eigen/SparseQR @@ -28,7 +28,6 @@ * */ -#include "OrderingMethods" #include "src/SparseCore/SparseColEtree.h" #include "src/SparseQR/SparseQR.h" diff --git a/eigenlib/Eigen/StdDeque b/eigenlib/Eigen/StdDeque index be3a7f82..bc68397b 100644 --- a/eigenlib/Eigen/StdDeque +++ b/eigenlib/Eigen/StdDeque @@ -14,7 +14,7 @@ #include "Core" #include -#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */ +#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */ #define EIGEN_DEFINE_STL_DEQUE_SPECIALIZATION(...) diff --git a/eigenlib/Eigen/StdList b/eigenlib/Eigen/StdList index 07ba1297..4c6262c0 100644 --- a/eigenlib/Eigen/StdList +++ b/eigenlib/Eigen/StdList @@ -13,7 +13,7 @@ #include "Core" #include -#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */ +#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */ #define EIGEN_DEFINE_STL_LIST_SPECIALIZATION(...) 
diff --git a/eigenlib/Eigen/StdVector b/eigenlib/Eigen/StdVector index fdfc3776..0c4697ad 100644 --- a/eigenlib/Eigen/StdVector +++ b/eigenlib/Eigen/StdVector @@ -14,7 +14,7 @@ #include "Core" #include -#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 /* MSVC auto aligns in 64 bit builds */ +#if EIGEN_COMP_MSVC && EIGEN_OS_WIN64 && (EIGEN_MAX_STATIC_ALIGN_BYTES<=16) /* MSVC auto aligns up to 16 bytes in 64 bit builds */ #define EIGEN_DEFINE_STL_VECTOR_SPECIALIZATION(...) diff --git a/eigenlib/Eigen/src/Cholesky/LDLT.h b/eigenlib/Eigen/src/Cholesky/LDLT.h index fcee7b2e..15ccf24f 100644 --- a/eigenlib/Eigen/src/Cholesky/LDLT.h +++ b/eigenlib/Eigen/src/Cholesky/LDLT.h @@ -248,7 +248,7 @@ template class LDLT /** \brief Reports whether previous computation was successful. * * \returns \c Success if computation was succesful, - * \c NumericalIssue if the matrix.appears to be negative. + * \c NumericalIssue if the factorization failed because of a zero pivot. */ ComputationInfo info() const { @@ -305,7 +305,8 @@ template<> struct ldlt_inplace if (size <= 1) { transpositions.setIdentity(); - if (numext::real(mat.coeff(0,0)) > static_cast(0) ) sign = PositiveSemiDef; + if(size==0) sign = ZeroSign; + else if (numext::real(mat.coeff(0,0)) > static_cast(0) ) sign = PositiveSemiDef; else if (numext::real(mat.coeff(0,0)) < static_cast(0)) sign = NegativeSemiDef; else sign = ZeroSign; return true; @@ -376,6 +377,8 @@ template<> struct ldlt_inplace if((rs>0) && pivot_is_valid) A21 /= realAkk; + else if(rs>0) + ret = ret && (A21.array()==Scalar(0)).all(); if(found_zero_pivot && pivot_is_valid) ret = false; // factorization failed else if(!pivot_is_valid) found_zero_pivot = true; @@ -568,13 +571,14 @@ void LDLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) cons // more precisely, use pseudo-inverse of D (see bug 241) using std::abs; const typename Diagonal::RealReturnType vecD(vectorD()); - // In some previous versions, tolerance was set to the max of 1/highest and the maximal diagonal entry * epsilon - // as motivated by LAPACK's xGELSS: + // In some previous versions, tolerance was set to the max of 1/highest (or rather numeric_limits::min()) + // and the maximal diagonal entry * epsilon as motivated by LAPACK's xGELSS: // RealScalar tolerance = numext::maxi(vecD.array().abs().maxCoeff() * NumTraits::epsilon(),RealScalar(1) / NumTraits::highest()); // However, LDLT is not rank revealing, and so adjusting the tolerance wrt to the highest // diagonal element is not well justified and leads to numerical issues in some cases. // Moreover, Lapack's xSYTRS routines use 0 for the tolerance. - RealScalar tolerance = RealScalar(1) / NumTraits::highest(); + // Using numeric_limits::min() gives us more robustness to denormals. + RealScalar tolerance = (std::numeric_limits::min)(); for (Index i = 0; i < vecD.size(); ++i) { diff --git a/eigenlib/Eigen/src/Cholesky/LLT.h b/eigenlib/Eigen/src/Cholesky/LLT.h index 87ca8d42..e1624d21 100644 --- a/eigenlib/Eigen/src/Cholesky/LLT.h +++ b/eigenlib/Eigen/src/Cholesky/LLT.h @@ -24,7 +24,7 @@ template struct LLT_Traits; * * \tparam _MatrixType the type of the matrix of which we are computing the LL^T Cholesky decomposition * \tparam _UpLo the triangular part that will be used for the decompositon: Lower (default) or Upper. - * The other triangular part won't be read. + * The other triangular part won't be read. * * This class performs a LL^T Cholesky decomposition of a symmetric, positive definite * matrix A such that A = LL^* = U^*U, where L is lower triangular. 
@@ -41,14 +41,18 @@ template struct LLT_Traits; * Example: \include LLT_example.cpp * Output: \verbinclude LLT_example.out * + * \b Performance: for best performance, it is recommended to use a column-major storage format + * with the Lower triangular part (the default), or, equivalently, a row-major storage format + * with the Upper triangular part. Otherwise, you might get a 20% slowdown for the full factorization + * step, and rank-updates can be up to 3 times slower. + * * This class supports the \link InplaceDecomposition inplace decomposition \endlink mechanism. * + * Note that during the decomposition, only the lower (or upper, as defined by _UpLo) triangular part of A is considered. + * Therefore, the strict lower part does not have to store correct values. + * * \sa MatrixBase::llt(), SelfAdjointView::llt(), class LDLT */ - /* HEY THIS DOX IS DISABLED BECAUSE THERE's A BUG EITHER HERE OR IN LDLT ABOUT THAT (OR BOTH) - * Note that during the decomposition, only the upper triangular part of A is considered. Therefore, - * the strict lower part does not have to store correct values. - */ template class LLT { public: @@ -146,7 +150,7 @@ template class LLT } template - void solveInPlace(MatrixBase &bAndX) const; + void solveInPlace(const MatrixBase &bAndX) const; template LLT& compute(const EigenBase& matrix); @@ -177,7 +181,7 @@ template class LLT /** \brief Reports whether previous computation was successful. * * \returns \c Success if computation was succesful, - * \c NumericalIssue if the matrix.appears to be negative. + * \c NumericalIssue if the matrix.appears not to be positive definite. */ ComputationInfo info() const { @@ -425,7 +429,8 @@ LLT& LLT::compute(const EigenBase eigen_assert(a.rows()==a.cols()); const Index size = a.rows(); m_matrix.resize(size, size); - m_matrix = a.derived(); + if (!internal::is_same_dense(m_matrix, a.derived())) + m_matrix = a.derived(); // Compute matrix L1 norm = max abs column sum. m_l1_norm = RealScalar(0); @@ -485,11 +490,14 @@ void LLT<_MatrixType,_UpLo>::_solve_impl(const RhsType &rhs, DstType &dst) const * * This version avoids a copy when the right hand side matrix b is not needed anymore. * + * \warning The parameter is only marked 'const' to make the C++ compiler accept a temporary expression here. + * This function will const_cast it, so constness isn't honored here. 
+ * * \sa LLT::solve(), MatrixBase::llt() */ template template -void LLT::solveInPlace(MatrixBase &bAndX) const +void LLT::solveInPlace(const MatrixBase &bAndX) const { eigen_assert(m_isInitialized && "LLT is not initialized."); eigen_assert(m_matrix.rows()==bAndX.rows()); diff --git a/eigenlib/Eigen/src/Core/Array.h b/eigenlib/Eigen/src/Core/Array.h index 0d34269f..16770fc7 100644 --- a/eigenlib/Eigen/src/Core/Array.h +++ b/eigenlib/Eigen/src/Core/Array.h @@ -153,8 +153,6 @@ class Array : Base(std::move(other)) { Base::_check_template_params(); - if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic) - Base::_set_noalias(other); } EIGEN_DEVICE_FUNC Array& operator=(Array&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) @@ -231,10 +229,16 @@ class Array : Base(other) { } + private: + struct PrivateType {}; + public: + /** \sa MatrixBase::operator=(const EigenBase&) */ template EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE Array(const EigenBase &other) + EIGEN_STRONG_INLINE Array(const EigenBase &other, + typename internal::enable_if::value, + PrivateType>::type = PrivateType()) : Base(other.derived()) { } diff --git a/eigenlib/Eigen/src/Core/ArrayBase.h b/eigenlib/Eigen/src/Core/ArrayBase.h index f0232f65..33f644e2 100644 --- a/eigenlib/Eigen/src/Core/ArrayBase.h +++ b/eigenlib/Eigen/src/Core/ArrayBase.h @@ -153,8 +153,8 @@ template class ArrayBase // inline void evalTo(Dest& dst) const { dst = matrix(); } protected: - EIGEN_DEVICE_FUNC - ArrayBase() : Base() {} + EIGEN_DEFAULT_COPY_CONSTRUCTOR(ArrayBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(ArrayBase) private: explicit ArrayBase(Index); @@ -175,7 +175,7 @@ template class ArrayBase */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator-=(const ArrayBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); @@ -188,7 +188,7 @@ ArrayBase::operator-=(const ArrayBase &other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator+=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -201,7 +201,7 @@ ArrayBase::operator+=(const ArrayBase& other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator*=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::mul_assign_op()); @@ -214,7 +214,7 @@ ArrayBase::operator*=(const ArrayBase& other) */ template template -EIGEN_STRONG_INLINE Derived & +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived & ArrayBase::operator/=(const ArrayBase& other) { call_assignment(derived(), other.derived(), internal::div_assign_op()); diff --git a/eigenlib/Eigen/src/Core/ArrayWrapper.h b/eigenlib/Eigen/src/Core/ArrayWrapper.h index a04521a1..688aadd6 100644 --- a/eigenlib/Eigen/src/Core/ArrayWrapper.h +++ b/eigenlib/Eigen/src/Core/ArrayWrapper.h @@ -32,7 +32,8 @@ struct traits > // Let's remove NestByRefBit enum { Flags0 = traits::type >::Flags, - Flags = Flags0 & ~NestByRefBit + LvalueBitFlag = is_lvalue::value ? LvalueBit : 0, + Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag }; }; } @@ -129,7 +130,8 @@ struct traits > // Let's remove NestByRefBit enum { Flags0 = traits::type >::Flags, - Flags = Flags0 & ~NestByRefBit + LvalueBitFlag = is_lvalue::value ? 
LvalueBit : 0, + Flags = (Flags0 & ~(NestByRefBit | LvalueBit)) | LvalueBitFlag }; }; } diff --git a/eigenlib/Eigen/src/Core/AssignEvaluator.h b/eigenlib/Eigen/src/Core/AssignEvaluator.h index 14400d24..dbe435d8 100644 --- a/eigenlib/Eigen/src/Core/AssignEvaluator.h +++ b/eigenlib/Eigen/src/Core/AssignEvaluator.h @@ -39,7 +39,7 @@ public: enum { DstAlignment = DstEvaluator::Alignment, SrcAlignment = SrcEvaluator::Alignment, - DstHasDirectAccess = DstFlags & DirectAccessBit, + DstHasDirectAccess = (DstFlags & DirectAccessBit) == DirectAccessBit, JointAlignment = EIGEN_PLAIN_ENUM_MIN(DstAlignment,SrcAlignment) }; @@ -83,7 +83,7 @@ private: && int(OuterStride)!=Dynamic && int(OuterStride)%int(InnerPacketSize)==0 && (EIGEN_UNALIGNED_VECTORIZE || int(JointAlignment)>=int(InnerRequiredAlignment)), MayLinearize = bool(StorageOrdersAgree) && (int(DstFlags) & int(SrcFlags) & LinearAccessBit), - MayLinearVectorize = bool(MightVectorize) && MayLinearize && DstHasDirectAccess + MayLinearVectorize = bool(MightVectorize) && bool(MayLinearize) && bool(DstHasDirectAccess) && (EIGEN_UNALIGNED_VECTORIZE || (int(DstAlignment)>=int(LinearRequiredAlignment)) || MaxSizeAtCompileTime == Dynamic), /* If the destination isn't aligned, we have to do runtime checks and we don't unroll, so it's only good for large enough sizes. */ @@ -515,7 +515,7 @@ struct dense_assignment_loop template struct dense_assignment_loop { - EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { typedef typename Kernel::Scalar Scalar; typedef typename Kernel::PacketType PacketType; @@ -563,7 +563,7 @@ struct dense_assignment_loop template struct dense_assignment_loop { - EIGEN_DEVICE_FUNC static inline void run(Kernel &kernel) + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE void run(Kernel &kernel) { typedef typename Kernel::DstEvaluatorType::XprType DstXprType; typedef typename Kernel::PacketType PacketType; @@ -701,6 +701,26 @@ protected: * Part 5 : Entry point for dense rectangular assignment ***************************************************************************/ +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const Functor &/*func*/) +{ + EIGEN_ONLY_USED_FOR_DEBUG(dst); + EIGEN_ONLY_USED_FOR_DEBUG(src); + eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); +} + +template +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +void resize_if_allowed(DstXprType &dst, const SrcXprType& src, const internal::assign_op &/*func*/) +{ + Index dstRows = src.rows(); + Index dstCols = src.cols(); + if(((dst.rows()!=dstRows) || (dst.cols()!=dstCols))) + dst.resize(dstRows, dstCols); + eigen_assert(dst.rows() == dstRows && dst.cols() == dstCols); +} + template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType& dst, const SrcXprType& src, const Functor &func) { @@ -711,10 +731,7 @@ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void call_dense_assignment_loop(DstXprType // NOTE To properly handle A = (A*A.transpose())/s with A rectangular, // we need to resize the destination after the source evaluator has been created. 
- Index dstRows = src.rows(); - Index dstCols = src.cols(); - if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) - dst.resize(dstRows, dstCols); + resize_if_allowed(dst, src, func); DstEvaluatorType dstEvaluator(dst); diff --git a/eigenlib/Eigen/src/Core/Assign_MKL.h b/eigenlib/Eigen/src/Core/Assign_MKL.h index 6c2ab926..6866095b 100755 --- a/eigenlib/Eigen/src/Core/Assign_MKL.h +++ b/eigenlib/Eigen/src/Core/Assign_MKL.h @@ -84,7 +84,8 @@ class vml_assign_traits struct Assignment, SrcXprNested>, assign_op, \ Dense2Dense, typename enable_if::EnableVml>::type> { \ typedef CwiseUnaryOp, SrcXprNested> SrcXprType; \ - static void run(DstXprType &dst, const SrcXprType &src, const assign_op &/*func*/) { \ + static void run(DstXprType &dst, const SrcXprType &src, const assign_op &func) { \ + resize_if_allowed(dst, src, func); \ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ if(vml_assign_traits::Traversal==LinearTraversal) { \ VMLOP(dst.size(), (const VMLTYPE*)src.nestedExpression().data(), \ @@ -144,7 +145,8 @@ EIGEN_MKL_VML_DECLARE_UNARY_CALLS_REAL(ceil, Ceil, _) Dense2Dense, typename enable_if::EnableVml>::type> { \ typedef CwiseBinaryOp, SrcXprNested, \ const CwiseNullaryOp,Plain> > SrcXprType; \ - static void run(DstXprType &dst, const SrcXprType &src, const assign_op &/*func*/) { \ + static void run(DstXprType &dst, const SrcXprType &src, const assign_op &func) { \ + resize_if_allowed(dst, src, func); \ eigen_assert(dst.rows() == src.rows() && dst.cols() == src.cols()); \ VMLTYPE exponent = reinterpret_cast(src.rhs().functor().m_other); \ if(vml_assign_traits::Traversal==LinearTraversal) \ diff --git a/eigenlib/Eigen/src/Core/ConditionEstimator.h b/eigenlib/Eigen/src/Core/ConditionEstimator.h index aa7efdc7..51a2e5f1 100644 --- a/eigenlib/Eigen/src/Core/ConditionEstimator.h +++ b/eigenlib/Eigen/src/Core/ConditionEstimator.h @@ -160,7 +160,7 @@ rcond_estimate_helper(typename Decomposition::RealScalar matrix_norm, const Deco { typedef typename Decomposition::RealScalar RealScalar; eigen_assert(dec.rows() == dec.cols()); - if (dec.rows() == 0) return RealScalar(1); + if (dec.rows() == 0) return NumTraits::infinity(); if (matrix_norm == RealScalar(0)) return RealScalar(0); if (dec.rows() == 1) return RealScalar(1); const RealScalar inverse_matrix_norm = rcond_invmatrix_L1_norm_estimate(dec); diff --git a/eigenlib/Eigen/src/Core/CoreEvaluators.h b/eigenlib/Eigen/src/Core/CoreEvaluators.h index 1d14af65..910889ef 100644 --- a/eigenlib/Eigen/src/Core/CoreEvaluators.h +++ b/eigenlib/Eigen/src/Core/CoreEvaluators.h @@ -977,7 +977,7 @@ struct evaluator > OuterStrideAtCompileTime = HasSameStorageOrderAsArgType ? int(outer_stride_at_compile_time::ret) : int(inner_stride_at_compile_time::ret), - MaskPacketAccessBit = (InnerStrideAtCompileTime == 1) ? PacketAccessBit : 0, + MaskPacketAccessBit = (InnerStrideAtCompileTime == 1 || HasSameStorageOrderAsArgType) ? PacketAccessBit : 0, FlagsLinearAccessBit = (RowsAtCompileTime == 1 || ColsAtCompileTime == 1 || (InnerPanel && (evaluator::Flags&LinearAccessBit))) ? LinearAccessBit : 0, FlagsRowMajorBit = XprType::Flags&RowMajorBit, @@ -987,7 +987,9 @@ struct evaluator > Flags = Flags0 | FlagsLinearAccessBit | FlagsRowMajorBit, PacketAlignment = unpacket_traits::alignment, - Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? 
int(PacketAlignment) : 0, + Alignment0 = (InnerPanel && (OuterStrideAtCompileTime!=Dynamic) + && (OuterStrideAtCompileTime!=0) + && (((OuterStrideAtCompileTime * int(sizeof(Scalar))) % int(PacketAlignment)) == 0)) ? int(PacketAlignment) : 0, Alignment = EIGEN_PLAIN_ENUM_MIN(evaluator::Alignment, Alignment0) }; typedef block_evaluator block_evaluator_type; @@ -1018,14 +1020,16 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC explicit unary_evaluator(const XprType& block) : m_argImpl(block.nestedExpression()), m_startRow(block.startRow()), - m_startCol(block.startCol()) + m_startCol(block.startCol()), + m_linear_offset(InnerPanel?(XprType::IsRowMajor ? block.startRow()*block.cols() : block.startCol()*block.rows()):0) { } typedef typename XprType::Scalar Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; enum { - RowsAtCompileTime = XprType::RowsAtCompileTime + RowsAtCompileTime = XprType::RowsAtCompileTime, + ForwardLinearAccess = InnerPanel && bool(evaluator::Flags&LinearAccessBit) }; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1037,7 +1041,10 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + if (ForwardLinearAccess) + return m_argImpl.coeff(m_linear_offset.value() + index); + else + return coeff(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE @@ -1049,7 +1056,10 @@ struct unary_evaluator, IndexBa EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index index) { - return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); + if (ForwardLinearAccess) + return m_argImpl.coeffRef(m_linear_offset.value() + index); + else + return coeffRef(RowsAtCompileTime == 1 ? 0 : index, RowsAtCompileTime == 1 ? index : 0); } template @@ -1063,8 +1073,11 @@ struct unary_evaluator, IndexBa EIGEN_STRONG_INLINE PacketType packet(Index index) const { - return packet(RowsAtCompileTime == 1 ? 0 : index, - RowsAtCompileTime == 1 ? index : 0); + if (ForwardLinearAccess) + return m_argImpl.template packet(m_linear_offset.value() + index); + else + return packet(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0); } template @@ -1078,15 +1091,19 @@ struct unary_evaluator, IndexBa EIGEN_STRONG_INLINE void writePacket(Index index, const PacketType& x) { - return writePacket(RowsAtCompileTime == 1 ? 0 : index, - RowsAtCompileTime == 1 ? index : 0, - x); + if (ForwardLinearAccess) + return m_argImpl.template writePacket(m_linear_offset.value() + index, x); + else + return writePacket(RowsAtCompileTime == 1 ? 0 : index, + RowsAtCompileTime == 1 ? index : 0, + x); } protected: evaluator m_argImpl; const variable_if_dynamic m_startRow; const variable_if_dynamic m_startCol; + const variable_if_dynamic m_linear_offset; }; // TODO: This evaluator does not actually use the child evaluator; @@ -1556,9 +1573,7 @@ struct evaluator > { } typedef typename XprType::Scalar Scalar; - // FIXME having to check whether ArgType is sparse here i not very nice. 
- typedef typename internal::conditional::value, - typename XprType::CoeffReturnType,Scalar>::type CoeffReturnType; + typedef typename XprType::CoeffReturnType CoeffReturnType; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index row, Index) const diff --git a/eigenlib/Eigen/src/Core/CwiseNullaryOp.h b/eigenlib/Eigen/src/Core/CwiseNullaryOp.h index dd498f75..ddd607e3 100644 --- a/eigenlib/Eigen/src/Core/CwiseNullaryOp.h +++ b/eigenlib/Eigen/src/Core/CwiseNullaryOp.h @@ -105,7 +105,7 @@ class CwiseNullaryOp : public internal::dense_xpr_base< CwiseNullaryOp template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(Index rows, Index cols, const CustomNullaryOp& func) { return CwiseNullaryOp(rows, cols, func); @@ -150,7 +150,7 @@ DenseBase::NullaryExpr(Index size, const CustomNullaryOp& func) */ template template -EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const CwiseNullaryOp::PlainObject> DenseBase::NullaryExpr(const CustomNullaryOp& func) { return CwiseNullaryOp(RowsAtCompileTime, ColsAtCompileTime, func); @@ -192,7 +192,7 @@ DenseBase::Constant(Index rows, Index cols, const Scalar& value) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(Index size, const Scalar& value) { return DenseBase::NullaryExpr(size, internal::scalar_constant_op(value)); @@ -208,7 +208,7 @@ DenseBase::Constant(Index size, const Scalar& value) * \sa class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Constant(const Scalar& value) { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) @@ -220,7 +220,7 @@ DenseBase::Constant(const Scalar& value) * \sa LinSpaced(Index,Scalar,Scalar), setLinSpaced(Index,const Scalar&,const Scalar&) */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -232,7 +232,7 @@ DenseBase::LinSpaced(Sequential_t, Index size, const Scalar& low, const * \sa LinSpaced(Scalar,Scalar) */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -264,7 +264,7 @@ DenseBase::LinSpaced(Sequential_t, const Scalar& low, const Scalar& hig * \sa setLinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -276,7 +276,7 @@ DenseBase::LinSpaced(Index size, const Scalar& low, const Scalar& high) * Special version for fixed size types which does not require the size parameter. 
*/ template -EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::RandomAccessLinSpacedReturnType DenseBase::LinSpaced(const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) @@ -286,7 +286,7 @@ DenseBase::LinSpaced(const Scalar& low, const Scalar& high) /** \returns true if all coefficients in this matrix are approximately equal to \a val, to within precision \a prec */ template -bool DenseBase::isApproxToConstant +EIGEN_DEVICE_FUNC bool DenseBase::isApproxToConstant (const Scalar& val, const RealScalar& prec) const { typename internal::nested_eval::type self(derived()); @@ -301,7 +301,7 @@ bool DenseBase::isApproxToConstant * * \returns true if all coefficients in this matrix are approximately equal to \a value, to within precision \a prec */ template -bool DenseBase::isConstant +EIGEN_DEVICE_FUNC bool DenseBase::isConstant (const Scalar& val, const RealScalar& prec) const { return isApproxToConstant(val, prec); @@ -312,7 +312,7 @@ bool DenseBase::isConstant * \sa setConstant(), Constant(), class CwiseNullaryOp */ template -EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) { setConstant(val); } @@ -322,7 +322,7 @@ EIGEN_STRONG_INLINE void DenseBase::fill(const Scalar& val) * \sa fill(), setConstant(Index,const Scalar&), setConstant(Index,Index,const Scalar&), setZero(), setOnes(), Constant(), class CwiseNullaryOp, setZero(), setOnes() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) { return derived() = Constant(rows(), cols(), val); } @@ -337,7 +337,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setConstant(const Scalar& val) * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setConstant(Index size, const Scalar& val) { resize(size); @@ -356,7 +356,7 @@ PlainObjectBase::setConstant(Index size, const Scalar& val) * \sa MatrixBase::setConstant(const Scalar&), setConstant(Index,const Scalar&), class CwiseNullaryOp, MatrixBase::Constant(const Scalar&) */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) { resize(rows, cols); @@ -380,7 +380,7 @@ PlainObjectBase::setConstant(Index rows, Index cols, const Scalar& val) * \sa LinSpaced(Index,const Scalar&,const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, const Scalar& low, const Scalar& high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return derived() = Derived::NullaryExpr(newSize, internal::linspaced_op(low,high,newSize)); @@ -400,7 +400,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(Index newSize, con * \sa LinSpaced(Index,const Scalar&,const Scalar&), setLinSpaced(Index, const Scalar&, const Scalar&), CwiseNullaryOp */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, const Scalar& high) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, const Scalar& 
high) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return setLinSpaced(size(), low, high); @@ -423,7 +423,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setLinSpaced(const Scalar& low, * \sa Zero(), Zero(Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero(Index rows, Index cols) { return Constant(rows, cols, Scalar(0)); @@ -446,7 +446,7 @@ DenseBase::Zero(Index rows, Index cols) * \sa Zero(), Zero(Index,Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero(Index size) { return Constant(size, Scalar(0)); @@ -463,7 +463,7 @@ DenseBase::Zero(Index size) * \sa Zero(Index), Zero(Index,Index) */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Zero() { return Constant(Scalar(0)); @@ -478,7 +478,7 @@ DenseBase::Zero() * \sa class CwiseNullaryOp, Zero() */ template -bool DenseBase::isZero(const RealScalar& prec) const +EIGEN_DEVICE_FUNC bool DenseBase::isZero(const RealScalar& prec) const { typename internal::nested_eval::type self(derived()); for(Index j = 0; j < cols(); ++j) @@ -496,7 +496,7 @@ bool DenseBase::isZero(const RealScalar& prec) const * \sa class CwiseNullaryOp, Zero() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setZero() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setZero() { return setConstant(Scalar(0)); } @@ -511,7 +511,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setZero() * \sa DenseBase::setZero(), setZero(Index,Index), class CwiseNullaryOp, DenseBase::Zero() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setZero(Index newSize) { resize(newSize); @@ -529,7 +529,7 @@ PlainObjectBase::setZero(Index newSize) * \sa DenseBase::setZero(), setZero(Index), class CwiseNullaryOp, DenseBase::Zero() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setZero(Index rows, Index cols) { resize(rows, cols); @@ -553,7 +553,7 @@ PlainObjectBase::setZero(Index rows, Index cols) * \sa Ones(), Ones(Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones(Index rows, Index cols) { return Constant(rows, cols, Scalar(1)); @@ -576,7 +576,7 @@ DenseBase::Ones(Index rows, Index cols) * \sa Ones(), Ones(Index,Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones(Index newSize) { return Constant(newSize, Scalar(1)); @@ -593,7 +593,7 @@ DenseBase::Ones(Index newSize) * \sa Ones(Index), Ones(Index,Index), isOnes(), class Ones */ template -EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename DenseBase::ConstantReturnType DenseBase::Ones() { return Constant(Scalar(1)); @@ -608,7 +608,7 @@ DenseBase::Ones() * \sa class CwiseNullaryOp, Ones() */ template -bool DenseBase::isOnes +EIGEN_DEVICE_FUNC bool DenseBase::isOnes (const RealScalar& prec) const { return isApproxToConstant(Scalar(1), prec); @@ -622,7 +622,7 @@ bool DenseBase::isOnes * 
\sa class CwiseNullaryOp, Ones() */ template -EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() { return setConstant(Scalar(1)); } @@ -637,7 +637,7 @@ EIGEN_STRONG_INLINE Derived& DenseBase::setOnes() * \sa MatrixBase::setOnes(), setOnes(Index,Index), class CwiseNullaryOp, MatrixBase::Ones() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setOnes(Index newSize) { resize(newSize); @@ -655,7 +655,7 @@ PlainObjectBase::setOnes(Index newSize) * \sa MatrixBase::setOnes(), setOnes(Index), class CwiseNullaryOp, MatrixBase::Ones() */ template -EIGEN_STRONG_INLINE Derived& +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& PlainObjectBase::setOnes(Index rows, Index cols) { resize(rows, cols); @@ -679,7 +679,7 @@ PlainObjectBase::setOnes(Index rows, Index cols) * \sa Identity(), setIdentity(), isIdentity() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType MatrixBase::Identity(Index rows, Index cols) { return DenseBase::NullaryExpr(rows, cols, internal::scalar_identity_op()); @@ -696,7 +696,7 @@ MatrixBase::Identity(Index rows, Index cols) * \sa Identity(Index,Index), setIdentity(), isIdentity() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::IdentityReturnType MatrixBase::Identity() { EIGEN_STATIC_ASSERT_FIXED_SIZE(Derived) @@ -771,7 +771,7 @@ struct setIdentity_impl * \sa class CwiseNullaryOp, Identity(), Identity(Index,Index), isIdentity() */ template -EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() { return internal::setIdentity_impl::run(derived()); } @@ -787,7 +787,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity() * \sa MatrixBase::setIdentity(), class CwiseNullaryOp, MatrixBase::Identity() */ template -EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index cols) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index cols) { derived().resize(rows, cols); return setIdentity(); @@ -800,7 +800,7 @@ EIGEN_STRONG_INLINE Derived& MatrixBase::setIdentity(Index rows, Index * \sa MatrixBase::Unit(Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index newSize, Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index newSize, Index i) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return BasisReturnType(SquareMatrixType::Identity(newSize,newSize), i); @@ -815,7 +815,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::UnitX(), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index i) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::Unit(Index i) { EIGEN_STATIC_ASSERT_VECTOR_ONLY(Derived) return BasisReturnType(SquareMatrixType::Identity(),i); @@ -828,7 +828,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), 
MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitX() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitX() { return Derived::Unit(0); } /** \returns an expression of the Y axis unit vector (0,1{,0}^*) @@ -838,7 +838,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitY() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitY() { return Derived::Unit(1); } /** \returns an expression of the Z axis unit vector (0,0,1{,0}^*) @@ -848,7 +848,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitZ() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitZ() { return Derived::Unit(2); } /** \returns an expression of the W axis unit vector (0,0,0,1) @@ -858,7 +858,7 @@ EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBa * \sa MatrixBase::Unit(Index,Index), MatrixBase::Unit(Index), MatrixBase::UnitY(), MatrixBase::UnitZ(), MatrixBase::UnitW() */ template -EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const typename MatrixBase::BasisReturnType MatrixBase::UnitW() { return Derived::Unit(3); } } // end namespace Eigen diff --git a/eigenlib/Eigen/src/Core/CwiseUnaryView.h b/eigenlib/Eigen/src/Core/CwiseUnaryView.h index 27103305..5a30fa8d 100644 --- a/eigenlib/Eigen/src/Core/CwiseUnaryView.h +++ b/eigenlib/Eigen/src/Core/CwiseUnaryView.h @@ -121,6 +121,8 @@ class CwiseUnaryViewImpl { return derived().nestedExpression().outerStride() * sizeof(typename internal::traits::Scalar) / sizeof(Scalar); } + protected: + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(CwiseUnaryViewImpl) }; } // end namespace Eigen diff --git a/eigenlib/Eigen/src/Core/DenseBase.h b/eigenlib/Eigen/src/Core/DenseBase.h index bd74e8a1..c55a6823 100644 --- a/eigenlib/Eigen/src/Core/DenseBase.h +++ b/eigenlib/Eigen/src/Core/DenseBase.h @@ -40,7 +40,7 @@ static inline void check_DenseIndex_is_signed() { */ template class DenseBase #ifndef EIGEN_PARSED_BY_DOXYGEN - : public DenseCoeffsBase + : public DenseCoeffsBase::value> #else : public DenseCoeffsBase #endif // not EIGEN_PARSED_BY_DOXYGEN @@ -71,7 +71,7 @@ template class DenseBase typedef Scalar value_type; typedef typename NumTraits::Real RealScalar; - typedef DenseCoeffsBase Base; + typedef DenseCoeffsBase::value> Base; using Base::derived; using Base::const_cast_derived; @@ -296,7 +296,7 @@ template class DenseBase EIGEN_DEVICE_FUNC Derived& operator=(const ReturnByValue& func); - /** \ínternal + /** \internal * Copies \a other into *this without evaluating other. \returns a reference to *this. * \deprecated */ template @@ -463,7 +463,17 @@ template class DenseBase EIGEN_DEVICE_FUNC void visit(Visitor& func) const; - inline const WithFormat format(const IOFormat& fmt) const; + /** \returns a WithFormat proxy object allowing to print a matrix the with given + * format \a fmt. 
+ * + * See class IOFormat for some examples. + * + * \sa class IOFormat, class WithFormat + */ + inline const WithFormat format(const IOFormat& fmt) const + { + return WithFormat(derived(), fmt); + } /** \returns the unique coefficient of a 1x1 expression */ EIGEN_DEVICE_FUNC @@ -474,9 +484,9 @@ template class DenseBase return derived().coeff(0,0); } - bool all() const; - bool any() const; - Index count() const; + EIGEN_DEVICE_FUNC bool all() const; + EIGEN_DEVICE_FUNC bool any() const; + EIGEN_DEVICE_FUNC Index count() const; typedef VectorwiseOp RowwiseReturnType; typedef const VectorwiseOp ConstRowwiseReturnType; @@ -577,11 +587,12 @@ template class DenseBase } protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(DenseBase) /** Default constructor. Do nothing. */ EIGEN_DEVICE_FUNC DenseBase() { /* Just checks for self-consistency of the flags. - * Only do it when debugging Eigen, as this borders on paranoiac and could slow compilation down + * Only do it when debugging Eigen, as this borders on paranoia and could slow compilation down */ #ifdef EIGEN_INTERNAL_DEBUGGING EIGEN_STATIC_ASSERT((EIGEN_IMPLIES(MaxRowsAtCompileTime==1 && MaxColsAtCompileTime!=1, int(IsRowMajor)) diff --git a/eigenlib/Eigen/src/Core/DenseStorage.h b/eigenlib/Eigen/src/Core/DenseStorage.h index 82201d96..7d6d4e66 100644 --- a/eigenlib/Eigen/src/Core/DenseStorage.h +++ b/eigenlib/Eigen/src/Core/DenseStorage.h @@ -13,9 +13,9 @@ #define EIGEN_MATRIXSTORAGE_H #ifdef EIGEN_DENSE_STORAGE_CTOR_PLUGIN - #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN EIGEN_DENSE_STORAGE_CTOR_PLUGIN; + #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) X; EIGEN_DENSE_STORAGE_CTOR_PLUGIN; #else - #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + #define EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(X) #endif namespace Eigen { @@ -184,12 +184,16 @@ template class DenseSt { internal::plain_array m_data; public: - EIGEN_DEVICE_FUNC DenseStorage() {} + EIGEN_DEVICE_FUNC DenseStorage() { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) + } EIGEN_DEVICE_FUNC explicit DenseStorage(internal::constructor_without_unaligned_array_assert) : m_data(internal::constructor_without_unaligned_array_assert()) {} EIGEN_DEVICE_FUNC - DenseStorage(const DenseStorage& other) : m_data(other.m_data) {} + DenseStorage(const DenseStorage& other) : m_data(other.m_data) { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = Size) + } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) { @@ -197,7 +201,7 @@ template class DenseSt return *this; } EIGEN_DEVICE_FUNC DenseStorage(Index size, Index rows, Index cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows==_Rows && cols==_Cols); EIGEN_UNUSED_VARIABLE(size); EIGEN_UNUSED_VARIABLE(rows); @@ -343,7 +347,7 @@ template class DenseStorage(size)), m_rows(rows), m_cols(cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows>=0 && cols >=0); } EIGEN_DEVICE_FUNC DenseStorage(const DenseStorage& other) @@ -351,6 +355,7 @@ template class DenseStorage class DenseStorage(m_data, m_rows*m_cols); - if (size) + if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative m_data = internal::conditional_aligned_new_auto(size); else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_rows = rows; m_cols = cols; @@ -422,7 +427,7 @@ template class 
DenseStorage(size)), m_cols(cols) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows==_Rows && cols >=0); EIGEN_UNUSED_VARIABLE(rows); } @@ -430,6 +435,7 @@ template class DenseStorage(_Rows*other.m_cols)) , m_cols(other.m_cols) { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_cols*_Rows) internal::smart_copy(other.m_data, other.m_data+_Rows*m_cols, m_data); } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) @@ -473,11 +479,11 @@ template class DenseStorage(m_data, _Rows*m_cols); - if (size) + if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative m_data = internal::conditional_aligned_new_auto(size); else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_cols = cols; } @@ -495,7 +501,7 @@ template class DenseStorage(size)), m_rows(rows) { - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) eigen_internal_assert(size==rows*cols && rows>=0 && cols == _Cols); EIGEN_UNUSED_VARIABLE(cols); } @@ -503,6 +509,7 @@ template class DenseStorage(other.m_rows*_Cols)) , m_rows(other.m_rows) { + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN(Index size = m_rows*_Cols) internal::smart_copy(other.m_data, other.m_data+other.m_rows*_Cols, m_data); } EIGEN_DEVICE_FUNC DenseStorage& operator=(const DenseStorage& other) @@ -546,11 +553,11 @@ template class DenseStorage(m_data, _Cols*m_rows); - if (size) + if (size>0) // >0 and not simply !=0 to let the compiler knows that size cannot be negative m_data = internal::conditional_aligned_new_auto(size); else m_data = 0; - EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN + EIGEN_INTERNAL_DENSE_STORAGE_CTOR_PLUGIN({}) } m_rows = rows; } diff --git a/eigenlib/Eigen/src/Core/Diagonal.h b/eigenlib/Eigen/src/Core/Diagonal.h index bfea0584..afcaf357 100644 --- a/eigenlib/Eigen/src/Core/Diagonal.h +++ b/eigenlib/Eigen/src/Core/Diagonal.h @@ -21,7 +21,7 @@ namespace Eigen { * \param MatrixType the type of the object in which we are taking a sub/main/super diagonal * \param DiagIndex the index of the sub/super diagonal. The default is 0 and it means the main diagonal. * A positive value means a superdiagonal, a negative value means a subdiagonal. - * You can also use Dynamic so the index can be set at runtime. + * You can also use DynamicIndex so the index can be set at runtime. * * The matrix is not required to be square. 
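// Usage sketch for the runtime-checked Diagonal constructor above, assuming
// the Eigen 3.3 API; the 3x4 size and the diagonal indices are illustrative.
#include <Eigen/Core>

void diagonal_demo() {
  Eigen::MatrixXd m = Eigen::MatrixXd::Random(3, 4);
  Eigen::VectorXd super = m.diagonal(1);   // first superdiagonal, index chosen at runtime
  Eigen::VectorXd sub   = m.diagonal(-1);  // first subdiagonal
  // An out-of-range index such as m.diagonal(7) now trips the new eigen_assert.
}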
* @@ -70,7 +70,10 @@ template class Diagonal EIGEN_DENSE_PUBLIC_INTERFACE(Diagonal) EIGEN_DEVICE_FUNC - explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) {} + explicit inline Diagonal(MatrixType& matrix, Index a_index = DiagIndex) : m_matrix(matrix), m_index(a_index) + { + eigen_assert( a_index <= m_matrix.cols() && -a_index <= m_matrix.rows() ); + } EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Diagonal) diff --git a/eigenlib/Eigen/src/Core/Dot.h b/eigenlib/Eigen/src/Core/Dot.h index 06ef18b8..1fe7a84a 100644 --- a/eigenlib/Eigen/src/Core/Dot.h +++ b/eigenlib/Eigen/src/Core/Dot.h @@ -31,7 +31,8 @@ struct dot_nocheck typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; typedef typename conj_prod::result_type ResScalar; EIGEN_DEVICE_FUNC - static inline ResScalar run(const MatrixBase& a, const MatrixBase& b) + EIGEN_STRONG_INLINE + static ResScalar run(const MatrixBase& a, const MatrixBase& b) { return a.template binaryExpr(b).sum(); } @@ -43,7 +44,8 @@ struct dot_nocheck typedef scalar_conj_product_op::Scalar,typename traits::Scalar> conj_prod; typedef typename conj_prod::result_type ResScalar; EIGEN_DEVICE_FUNC - static inline ResScalar run(const MatrixBase& a, const MatrixBase& b) + EIGEN_STRONG_INLINE + static ResScalar run(const MatrixBase& a, const MatrixBase& b) { return a.transpose().template binaryExpr(b).sum(); } @@ -65,6 +67,7 @@ struct dot_nocheck template template EIGEN_DEVICE_FUNC +EIGEN_STRONG_INLINE typename ScalarBinaryOpTraits::Scalar,typename internal::traits::Scalar>::ReturnType MatrixBase::dot(const MatrixBase& other) const { @@ -102,7 +105,7 @@ EIGEN_STRONG_INLINE typename NumTraits::Scala * \sa lpNorm(), dot(), squaredNorm() */ template -inline typename NumTraits::Scalar>::Real MatrixBase::norm() const +EIGEN_STRONG_INLINE typename NumTraits::Scalar>::Real MatrixBase::norm() const { return numext::sqrt(squaredNorm()); } @@ -117,7 +120,7 @@ inline typename NumTraits::Scalar>::Real Matr * \sa norm(), normalize() */ template -inline const typename MatrixBase::PlainObject +EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::normalized() const { typedef typename internal::nested_eval::type _Nested; @@ -139,7 +142,7 @@ MatrixBase::normalized() const * \sa norm(), normalized() */ template -inline void MatrixBase::normalize() +EIGEN_STRONG_INLINE void MatrixBase::normalize() { RealScalar z = squaredNorm(); // NOTE: after extensive benchmarking, this conditional does not impact performance, at least on recent x86 CPU @@ -160,7 +163,7 @@ inline void MatrixBase::normalize() * \sa stableNorm(), stableNormalize(), normalized() */ template -inline const typename MatrixBase::PlainObject +EIGEN_STRONG_INLINE const typename MatrixBase::PlainObject MatrixBase::stableNormalized() const { typedef typename internal::nested_eval::type _Nested; @@ -185,7 +188,7 @@ MatrixBase::stableNormalized() const * \sa stableNorm(), stableNormalized(), normalize() */ template -inline void MatrixBase::stableNormalize() +EIGEN_STRONG_INLINE void MatrixBase::stableNormalize() { RealScalar w = cwiseAbs().maxCoeff(); RealScalar z = (derived()/w).squaredNorm(); diff --git a/eigenlib/Eigen/src/Core/EigenBase.h b/eigenlib/Eigen/src/Core/EigenBase.h index f76995af..b195506a 100644 --- a/eigenlib/Eigen/src/Core/EigenBase.h +++ b/eigenlib/Eigen/src/Core/EigenBase.h @@ -14,6 +14,7 @@ namespace Eigen { /** \class EigenBase + * \ingroup Core_Module * * Common base class for all classes T such that MatrixBase has an operator=(T) 
and a constructor MatrixBase(T). * @@ -128,6 +129,7 @@ template struct EigenBase */ template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator=(const EigenBase &other) { call_assignment(derived(), other.derived()); @@ -136,6 +138,7 @@ Derived& DenseBase::operator=(const EigenBase &other) template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator+=(const EigenBase &other) { call_assignment(derived(), other.derived(), internal::add_assign_op()); @@ -144,6 +147,7 @@ Derived& DenseBase::operator+=(const EigenBase &other) template template +EIGEN_DEVICE_FUNC Derived& DenseBase::operator-=(const EigenBase &other) { call_assignment(derived(), other.derived(), internal::sub_assign_op()); diff --git a/eigenlib/Eigen/src/Core/GeneralProduct.h b/eigenlib/Eigen/src/Core/GeneralProduct.h index 0f16cd8e..6f0cc80e 100644 --- a/eigenlib/Eigen/src/Core/GeneralProduct.h +++ b/eigenlib/Eigen/src/Core/GeneralProduct.h @@ -24,12 +24,17 @@ template struct product_type_selector; template struct product_size_category { - enum { is_large = MaxSize == Dynamic || - Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || - (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), - value = is_large ? Large - : Size == 1 ? 1 - : Small + enum { + #ifndef EIGEN_CUDA_ARCH + is_large = MaxSize == Dynamic || + Size >= EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD || + (Size==Dynamic && MaxSize>=EIGEN_CACHEFRIENDLY_PRODUCT_THRESHOLD), + #else + is_large = 0, + #endif + value = is_large ? Large + : Size == 1 ? 1 + : Small }; }; @@ -379,8 +384,6 @@ template<> struct gemv_dense_selector * * \sa lazyProduct(), operator*=(const MatrixBase&), Cwise::operator*() */ -#ifndef __CUDACC__ - template template inline const Product @@ -412,8 +415,6 @@ MatrixBase::operator*(const MatrixBase &other) const return Product(derived(), other.derived()); } -#endif // __CUDACC__ - /** \returns an expression of the matrix product of \c *this and \a other without implicit evaluation. * * The returned product will behave like any other expressions: the coefficients of the product will be diff --git a/eigenlib/Eigen/src/Core/GenericPacketMath.h b/eigenlib/Eigen/src/Core/GenericPacketMath.h index 27033a2d..e5944377 100644 --- a/eigenlib/Eigen/src/Core/GenericPacketMath.h +++ b/eigenlib/Eigen/src/Core/GenericPacketMath.h @@ -230,7 +230,7 @@ pload1(const typename unpacket_traits::type *a) { return pset1( * duplicated to form: {from[0],from[0],from[1],from[1],from[2],from[2],from[3],from[3]} * Currently, this function is only used for scalar * complex products. */ -template EIGEN_DEVICE_FUNC inline Packet +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet ploaddup(const typename unpacket_traits::type* from) { return *from; } /** \internal \returns a packet with elements of \a *from quadrupled. @@ -278,7 +278,7 @@ inline void pbroadcast2(const typename unpacket_traits::type *a, } /** \internal \brief Returns a packet with coefficients (a,a+1,...,a+packet_size-1). */ -template inline Packet +template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet plset(const typename unpacket_traits::type& a) { return a; } /** \internal copy the packet \a from to \a *to, \a to must be 16 bytes aligned */ @@ -351,10 +351,7 @@ template EIGEN_DEVICE_FUNC inline Packet preverse(const Packet& /** \internal \returns \a a with real and imaginary part flipped (for complex type only) */ template EIGEN_DEVICE_FUNC inline Packet pcplxflip(const Packet& a) { - // FIXME: uncomment the following in case we drop the internal imag and real functions. 
-// using std::imag; -// using std::real; - return Packet(imag(a),real(a)); + return Packet(a.imag(),a.real()); } /************************** @@ -482,7 +479,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void pstoret(Scalar* to, const Packet& fro * by the current computation. */ template -inline Packet ploadt_ro(const typename unpacket_traits::type* from) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet ploadt_ro(const typename unpacket_traits::type* from) { return ploadt(from); } @@ -524,10 +521,10 @@ inline void palign(PacketType& first, const PacketType& second) #ifndef __CUDACC__ template<> inline std::complex pmul(const std::complex& a, const std::complex& b) -{ return std::complex(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } +{ return std::complex(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); } template<> inline std::complex pmul(const std::complex& a, const std::complex& b) -{ return std::complex(real(a)*real(b) - imag(a)*imag(b), imag(a)*real(b) + real(a)*imag(b)); } +{ return std::complex(a.real()*b.real() - a.imag()*b.imag(), a.imag()*b.real() + a.real()*b.imag()); } #endif diff --git a/eigenlib/Eigen/src/Core/IO.h b/eigenlib/Eigen/src/Core/IO.h index 644228c3..da7fd6cc 100644 --- a/eigenlib/Eigen/src/Core/IO.h +++ b/eigenlib/Eigen/src/Core/IO.h @@ -109,20 +109,6 @@ class WithFormat IOFormat m_format; }; -/** \returns a WithFormat proxy object allowing to print a matrix the with given - * format \a fmt. - * - * See class IOFormat for some examples. - * - * \sa class IOFormat, class WithFormat - */ -template -inline const WithFormat -DenseBase::format(const IOFormat& fmt) const -{ - return WithFormat(derived(), fmt); -} - namespace internal { // NOTE: This helper is kept for backward compatibility with previous code specializing diff --git a/eigenlib/Eigen/src/Core/Map.h b/eigenlib/Eigen/src/Core/Map.h index 06d19670..548bf9a2 100644 --- a/eigenlib/Eigen/src/Core/Map.h +++ b/eigenlib/Eigen/src/Core/Map.h @@ -20,11 +20,17 @@ struct traits > { typedef traits TraitsBase; enum { + PlainObjectTypeInnerSize = ((traits::Flags&RowMajorBit)==RowMajorBit) + ? PlainObjectType::ColsAtCompileTime + : PlainObjectType::RowsAtCompileTime, + InnerStrideAtCompileTime = StrideType::InnerStrideAtCompileTime == 0 ? int(PlainObjectType::InnerStrideAtCompileTime) : int(StrideType::InnerStrideAtCompileTime), OuterStrideAtCompileTime = StrideType::OuterStrideAtCompileTime == 0 - ? int(PlainObjectType::OuterStrideAtCompileTime) + ? (InnerStrideAtCompileTime==Dynamic || PlainObjectTypeInnerSize==Dynamic + ? Dynamic + : int(InnerStrideAtCompileTime) * int(PlainObjectTypeInnerSize)) : int(StrideType::OuterStrideAtCompileTime), Alignment = int(MapOptions)&int(AlignedMask), Flags0 = TraitsBase::Flags & (~NestByRefBit), @@ -107,10 +113,11 @@ template class Ma EIGEN_DEVICE_FUNC inline Index outerStride() const { - return StrideType::OuterStrideAtCompileTime != 0 ? m_stride.outer() - : IsVectorAtCompileTime ? this->size() - : int(Flags)&RowMajorBit ? this->cols() - : this->rows(); + return int(StrideType::OuterStrideAtCompileTime) != 0 ? m_stride.outer() + : int(internal::traits::OuterStrideAtCompileTime) != Dynamic ? Index(internal::traits::OuterStrideAtCompileTime) + : IsVectorAtCompileTime ? (this->size() * innerStride()) + : (int(Flags)&RowMajorBit) ? (this->cols() * innerStride()) + : (this->rows() * innerStride()); } /** Constructor in the fixed-size case. 
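// Sketch of the strided-Map case that the traits/outerStride() changes above
// address, assuming the Eigen 3.3 API; buffer size and strides are illustrative.
#include <Eigen/Core>

void map_demo() {
  float data[12] = {0};
  // 3x3 column-major view into a buffer whose columns are 4 floats apart.
  Eigen::Map<Eigen::MatrixXf, 0, Eigen::OuterStride<> >
      view(data, 3, 3, Eigen::OuterStride<>(4));
  view(1, 2) = 42.0f;  // writes data[2*4 + 1]
}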
diff --git a/eigenlib/Eigen/src/Core/MapBase.h b/eigenlib/Eigen/src/Core/MapBase.h index 020f939a..92c3b281 100644 --- a/eigenlib/Eigen/src/Core/MapBase.h +++ b/eigenlib/Eigen/src/Core/MapBase.h @@ -43,6 +43,7 @@ template class MapBase enum { RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime, + InnerStrideAtCompileTime = internal::traits::InnerStrideAtCompileTime, SizeAtCompileTime = Base::SizeAtCompileTime }; @@ -181,14 +182,19 @@ template class MapBase #endif protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase) template EIGEN_DEVICE_FUNC void checkSanity(typename internal::enable_if<(internal::traits::Alignment>0),void*>::type = 0) const { #if EIGEN_MAX_ALIGN_BYTES>0 + // innerStride() is not set yet when this function is called, so we optimistically assume the lowest plausible value: + const Index minInnerStride = InnerStrideAtCompileTime == Dynamic ? 1 : Index(InnerStrideAtCompileTime); + EIGEN_ONLY_USED_FOR_DEBUG(minInnerStride); eigen_assert(( ((internal::UIntPtr(m_data) % internal::traits::Alignment) == 0) - || (cols() * rows() * innerStride() * sizeof(Scalar)) < internal::traits::Alignment ) && "data is not aligned"); + || (cols() * rows() * minInnerStride * sizeof(Scalar)) < internal::traits::Alignment ) && "data is not aligned"); #endif } @@ -290,6 +296,9 @@ template class MapBase // In theory we could simply refer to Base:Base::operator=, but MSVC does not like Base::Base, // see bugs 821 and 920. using ReadOnlyMapBase::Base::operator=; + protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(MapBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MapBase) }; #undef EIGEN_STATIC_ASSERT_INDEX_BASED_ACCESS diff --git a/eigenlib/Eigen/src/Core/MathFunctions.h b/eigenlib/Eigen/src/Core/MathFunctions.h index 8d47fb8a..01736c2a 100644 --- a/eigenlib/Eigen/src/Core/MathFunctions.h +++ b/eigenlib/Eigen/src/Core/MathFunctions.h @@ -287,7 +287,7 @@ struct abs2_impl_default // IsComplex EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { - return real(x)*real(x) + imag(x)*imag(x); + return x.real()*x.real() + x.imag()*x.imag(); } }; @@ -313,14 +313,17 @@ struct abs2_retval ****************************************************************************/ template -struct norm1_default_impl +struct norm1_default_impl; + +template +struct norm1_default_impl { typedef typename NumTraits::Real RealScalar; EIGEN_DEVICE_FUNC static inline RealScalar run(const Scalar& x) { EIGEN_USING_STD_MATH(abs); - return abs(real(x)) + abs(imag(x)); + return abs(x.real()) + abs(x.imag()); } }; @@ -348,31 +351,7 @@ struct norm1_retval * Implementation of hypot * ****************************************************************************/ -template -struct hypot_impl -{ - typedef typename NumTraits::Real RealScalar; - static inline RealScalar run(const Scalar& x, const Scalar& y) - { - EIGEN_USING_STD_MATH(abs); - EIGEN_USING_STD_MATH(sqrt); - RealScalar _x = abs(x); - RealScalar _y = abs(y); - Scalar p, qp; - if(_x>_y) - { - p = _x; - qp = _y / p; - } - else - { - p = _y; - qp = _x / p; - } - if(p==RealScalar(0)) return RealScalar(0); - return p * sqrt(RealScalar(1) + qp*qp); - } -}; +template struct hypot_impl; template struct hypot_retval @@ -495,7 +474,7 @@ namespace std_fallback { typedef typename NumTraits::Real RealScalar; EIGEN_USING_STD_MATH(log); Scalar x1p = RealScalar(1) + x; - return ( x1p == Scalar(1) ) ? 
x : x * ( log(x1p) / (x1p - RealScalar(1)) ); + return numext::equal_strict(x1p, Scalar(1)) ? x : x * ( log(x1p) / (x1p - RealScalar(1)) ); } } @@ -640,21 +619,28 @@ template struct random_default_impl { static inline Scalar run(const Scalar& x, const Scalar& y) - { - typedef typename conditional::IsSigned,std::ptrdiff_t,std::size_t>::type ScalarX; - if(y=x the result converted to an unsigned long is still correct. - std::size_t range = ScalarX(y)-ScalarX(x); - std::size_t offset = 0; - // rejection sampling - std::size_t divisor = 1; - std::size_t multiplier = 1; - if(range::type ScalarU; + // ScalarX is the widest of ScalarU and unsigned int. + // We'll deal only with ScalarX and unsigned int below thus avoiding signed + // types and arithmetic and signed overflows (which are undefined behavior). + typedef typename conditional<(ScalarU(-1) > unsigned(-1)), ScalarU, unsigned>::type ScalarX; + // The following difference doesn't overflow, provided our integer types are two's + // complement and have the same number of padding bits in signed and unsigned variants. + // This is the case in most modern implementations of C++. + ScalarX range = ScalarX(y) - ScalarX(x); + ScalarX offset = 0; + ScalarX divisor = 1; + ScalarX multiplier = 1; + const unsigned rand_max = RAND_MAX; + if (range <= rand_max) divisor = (rand_max + 1) / (range + 1); + else multiplier = 1 + range / (rand_max + 1); + // Rejection sampling. do { - offset = (std::size_t(std::rand()) * multiplier) / divisor; + offset = (unsigned(std::rand()) * multiplier) / divisor; } while (offset > range); return Scalar(ScalarX(x) + offset); } @@ -679,8 +665,8 @@ struct random_default_impl { static inline Scalar run(const Scalar& x, const Scalar& y) { - return Scalar(random(real(x), real(y)), - random(imag(x), imag(y))); + return Scalar(random(x.real(), y.real()), + random(x.imag(), y.imag())); } static inline Scalar run() { @@ -933,6 +919,9 @@ inline EIGEN_MATHFUNC_RETVAL(abs2, Scalar) abs2(const Scalar& x) return EIGEN_MATHFUNC_IMPL(abs2, Scalar)::run(x); } +EIGEN_DEVICE_FUNC +inline bool abs2(bool x) { return x; } + template EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(norm1, Scalar) norm1(const Scalar& x) @@ -1030,7 +1019,8 @@ inline int log2(int x) /** \returns the square root of \a x. * - * It is essentially equivalent to \code using std::sqrt; return sqrt(x); \endcode, + * It is essentially equivalent to + * \code using std::sqrt; return sqrt(x); \endcode * but slightly faster for float/double and some compilers (e.g., gcc), thanks to * specializations when SSE is enabled. 
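// Scalar sketch of the rejection-sampling scheme introduced above for integer
// random(x, y): do all arithmetic in an unsigned type and reject overshoot
// rather than folding it back, which would bias the distribution. The helper
// name and the choice of unsigned long are illustrative only.
#include <cstdlib>

long random_in_range(long x, long y) {  // requires x <= y
  typedef unsigned long U;
  const U range = U(y) - U(x);          // no signed overflow: wraps in unsigned
  const U rand_max = RAND_MAX;
  U divisor = 1, multiplier = 1;
  if (range <= rand_max) divisor = (rand_max + 1) / (range + 1);
  else                   multiplier = 1 + range / (rand_max + 1);
  U offset;
  do {
    offset = (U(std::rand()) * multiplier) / divisor;  // scale rand() onto ~[0, range]
  } while (offset > range);             // rejection step
  return long(U(x) + offset);
}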
* @@ -1061,11 +1051,24 @@ double log(const double &x) { return ::log(x); } template EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -typename NumTraits::Real abs(const T &x) { +typename internal::enable_if::IsSigned || NumTraits::IsComplex,typename NumTraits::Real>::type +abs(const T &x) { EIGEN_USING_STD_MATH(abs); return abs(x); } +template +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +typename internal::enable_if::IsSigned || NumTraits::IsComplex),typename NumTraits::Real>::type +abs(const T &x) { + return x; +} + +#if defined(__SYCL_DEVICE_ONLY__) +EIGEN_ALWAYS_INLINE float abs(float x) { return cl::sycl::fabs(x); } +EIGEN_ALWAYS_INLINE double abs(double x) { return cl::sycl::fabs(x); } +#endif // defined(__SYCL_DEVICE_ONLY__) + #ifdef __CUDACC__ template<> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE float abs(const float &x) { return ::fabsf(x); } diff --git a/eigenlib/Eigen/src/Core/MathFunctionsImpl.h b/eigenlib/Eigen/src/Core/MathFunctionsImpl.h index 3c9ef22f..9c1ceb0e 100644 --- a/eigenlib/Eigen/src/Core/MathFunctionsImpl.h +++ b/eigenlib/Eigen/src/Core/MathFunctionsImpl.h @@ -71,6 +71,29 @@ T generic_fast_tanh_float(const T& a_x) return pdiv(p, q); } +template +EIGEN_STRONG_INLINE +RealScalar positive_real_hypot(const RealScalar& x, const RealScalar& y) +{ + EIGEN_USING_STD_MATH(sqrt); + RealScalar p, qp; + p = numext::maxi(x,y); + if(p==RealScalar(0)) return RealScalar(0); + qp = numext::mini(y,x) / p; + return p * sqrt(RealScalar(1) + qp*qp); +} + +template +struct hypot_impl +{ + typedef typename NumTraits::Real RealScalar; + static inline RealScalar run(const Scalar& x, const Scalar& y) + { + EIGEN_USING_STD_MATH(abs); + return positive_real_hypot(abs(x), abs(y)); + } +}; + } // end namespace internal } // end namespace Eigen diff --git a/eigenlib/Eigen/src/Core/Matrix.h b/eigenlib/Eigen/src/Core/Matrix.h index 90c336d8..7f4a7af9 100644 --- a/eigenlib/Eigen/src/Core/Matrix.h +++ b/eigenlib/Eigen/src/Core/Matrix.h @@ -274,8 +274,6 @@ class Matrix : Base(std::move(other)) { Base::_check_template_params(); - if (RowsAtCompileTime!=Dynamic && ColsAtCompileTime!=Dynamic) - Base::_set_noalias(other); } EIGEN_DEVICE_FUNC Matrix& operator=(Matrix&& other) EIGEN_NOEXCEPT_IF(std::is_nothrow_move_assignable::value) diff --git a/eigenlib/Eigen/src/Core/MatrixBase.h b/eigenlib/Eigen/src/Core/MatrixBase.h index f7cf04cd..f8bcc8c6 100644 --- a/eigenlib/Eigen/src/Core/MatrixBase.h +++ b/eigenlib/Eigen/src/Core/MatrixBase.h @@ -160,20 +160,11 @@ template class MatrixBase EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& operator-=(const MatrixBase& other); -#ifdef __CUDACC__ template EIGEN_DEVICE_FUNC - const Product - operator*(const MatrixBase &other) const - { return this->lazyProduct(other); } -#else - - template const Product operator*(const MatrixBase &other) const; -#endif - template EIGEN_DEVICE_FUNC const Product @@ -294,7 +285,7 @@ template class MatrixBase * fuzzy comparison such as isApprox() * \sa isApprox(), operator!= */ template - inline bool operator==(const MatrixBase& other) const + EIGEN_DEVICE_FUNC inline bool operator==(const MatrixBase& other) const { return cwiseEqual(other).all(); } /** \returns true if at least one pair of coefficients of \c *this and \a other are not exactly equal to each other. 
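// Worked scalar version of the positive_real_hypot() introduced above:
// factoring out p = max(|x|, |y|) keeps the squared term in [0, 1], so
// sqrt(x*x + y*y) is computed without intermediate overflow or underflow.
// The function name is illustrative.
#include <algorithm>
#include <cmath>

double hypot_sketch(double x, double y) {
  const double ax = std::abs(x), ay = std::abs(y);
  const double p = std::max(ax, ay);
  if (p == 0.0) return 0.0;               // avoid 0/0 when both inputs are zero
  const double q = std::min(ax, ay) / p;  // q <= 1, so q*q cannot overflow
  return p * std::sqrt(1.0 + q * q);
}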
@@ -302,7 +293,7 @@ template class MatrixBase * fuzzy comparison such as isApprox() * \sa isApprox(), operator== */ template - inline bool operator!=(const MatrixBase& other) const + EIGEN_DEVICE_FUNC inline bool operator!=(const MatrixBase& other) const { return cwiseNotEqual(other).any(); } NoAlias noalias(); @@ -453,19 +444,28 @@ template class MatrixBase ///////// MatrixFunctions module ///////// typedef typename internal::stem_function::type StemFunction; - const MatrixExponentialReturnValue exp() const; +#define EIGEN_MATRIX_FUNCTION(ReturnType, Name, Description) \ + /** \returns an expression of the matrix Description of \c *this. \brief This function requires the unsupported MatrixFunctions module. To compute the coefficient-wise Description use ArrayBase::##Name . */ \ + const ReturnType Name() const; +#define EIGEN_MATRIX_FUNCTION_1(ReturnType, Name, Description, Argument) \ + /** \returns an expression of the matrix Description of \c *this. \brief This function requires the unsupported MatrixFunctions module. To compute the coefficient-wise Description use ArrayBase::##Name . */ \ + const ReturnType Name(Argument) const; + + EIGEN_MATRIX_FUNCTION(MatrixExponentialReturnValue, exp, exponential) + /** \brief Helper function for the unsupported MatrixFunctions module.*/ const MatrixFunctionReturnValue matrixFunction(StemFunction f) const; - const MatrixFunctionReturnValue cosh() const; - const MatrixFunctionReturnValue sinh() const; - const MatrixFunctionReturnValue cos() const; - const MatrixFunctionReturnValue sin() const; - const MatrixSquareRootReturnValue sqrt() const; - const MatrixLogarithmReturnValue log() const; - const MatrixPowerReturnValue pow(const RealScalar& p) const; - const MatrixComplexPowerReturnValue pow(const std::complex& p) const; + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cosh, hyperbolic cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sinh, hyperbolic sine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, cos, cosine) + EIGEN_MATRIX_FUNCTION(MatrixFunctionReturnValue, sin, sine) + EIGEN_MATRIX_FUNCTION(MatrixSquareRootReturnValue, sqrt, square root) + EIGEN_MATRIX_FUNCTION(MatrixLogarithmReturnValue, log, logarithm) + EIGEN_MATRIX_FUNCTION_1(MatrixPowerReturnValue, pow, power to \c p, const RealScalar& p) + EIGEN_MATRIX_FUNCTION_1(MatrixComplexPowerReturnValue, pow, power to \c p, const std::complex& p) protected: - EIGEN_DEVICE_FUNC MatrixBase() : Base() {} + EIGEN_DEFAULT_COPY_CONSTRUCTOR(MatrixBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(MatrixBase) private: EIGEN_DEVICE_FUNC explicit MatrixBase(int); diff --git a/eigenlib/Eigen/src/Core/NumTraits.h b/eigenlib/Eigen/src/Core/NumTraits.h index dd61195b..daf48987 100644 --- a/eigenlib/Eigen/src/Core/NumTraits.h +++ b/eigenlib/Eigen/src/Core/NumTraits.h @@ -215,6 +215,8 @@ struct NumTraits > static inline RealScalar epsilon() { return NumTraits::epsilon(); } EIGEN_DEVICE_FUNC static inline RealScalar dummy_precision() { return NumTraits::dummy_precision(); } + + static inline int digits10() { return NumTraits::digits10(); } }; template<> struct NumTraits diff --git a/eigenlib/Eigen/src/Core/PermutationMatrix.h b/eigenlib/Eigen/src/Core/PermutationMatrix.h index b1fb455b..47c06ba7 100644 --- a/eigenlib/Eigen/src/Core/PermutationMatrix.h +++ b/eigenlib/Eigen/src/Core/PermutationMatrix.h @@ -87,17 +87,6 @@ class PermutationBase : public EigenBase return derived(); } - #ifndef EIGEN_PARSED_BY_DOXYGEN - /** This is a special case of the templated operator=. 
Its purpose is to - * prevent a default operator= from hiding the templated operator=. - */ - Derived& operator=(const PermutationBase& other) - { - indices() = other.indices(); - return derived(); - } - #endif - /** \returns the number of rows */ inline Index rows() const { return Index(indices().size()); } @@ -333,12 +322,6 @@ class PermutationMatrix : public PermutationBase& other) : m_indices(other.indices()) {} - #ifndef EIGEN_PARSED_BY_DOXYGEN - /** Standard copy constructor. Defined only to prevent a default copy constructor - * from hiding the other templated constructor */ - inline PermutationMatrix(const PermutationMatrix& other) : m_indices(other.indices()) {} - #endif - /** Generic constructor from expression of the indices. The indices * array has the meaning that the permutations sends each integer i to indices[i]. * @@ -373,17 +356,6 @@ class PermutationMatrix : public PermutationBase struct check_rows_cols_for_overflow { { // http://hg.mozilla.org/mozilla-central/file/6c8a909977d3/xpcom/ds/CheckedInt.h#l242 // we assume Index is signed - Index max_index = (size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed + Index max_index = (std::size_t(1) << (8 * sizeof(Index) - 1)) - 1; // assume Index is signed bool error = (rows == 0 || cols == 0) ? false : (rows > max_index / cols); if (error) @@ -577,6 +577,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type * while the AlignedMap() functions return aligned Map objects and thus should be called only with 16-byte-aligned * \a data pointers. * + * Here is an example using strides: + * \include Matrix_Map_stride.cpp + * Output: \verbinclude Matrix_Map_stride.out + * * \see class Map */ //@{ @@ -733,8 +737,10 @@ class PlainObjectBase : public internal::dense_xpr_base::type EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init2(Index rows, Index cols, typename internal::enable_if::type* = 0) { - EIGEN_STATIC_ASSERT(bool(NumTraits::IsInteger) && - bool(NumTraits::IsInteger), + const bool t0_is_integer_alike = internal::is_valid_index_type::value; + const bool t1_is_integer_alike = internal::is_valid_index_type::value; + EIGEN_STATIC_ASSERT(t0_is_integer_alike && + t1_is_integer_alike, FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) resize(rows,cols); } @@ -769,9 +775,9 @@ class PlainObjectBase : public internal::dense_xpr_base::type && ((!internal::is_same::XprKind,ArrayXpr>::value || Base::SizeAtCompileTime==Dynamic)),T>::type* = 0) { // NOTE MSVC 2008 complains if we directly put bool(NumTraits::IsInteger) as the EIGEN_STATIC_ASSERT argument. - const bool is_integer = NumTraits::IsInteger; - EIGEN_UNUSED_VARIABLE(is_integer); - EIGEN_STATIC_ASSERT(is_integer, + const bool is_integer_alike = internal::is_valid_index_type::value; + EIGEN_UNUSED_VARIABLE(is_integer_alike); + EIGEN_STATIC_ASSERT(is_integer_alike, FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED) resize(size); } @@ -812,6 +818,13 @@ class PlainObjectBase : public internal::dense_xpr_base::type this->_set_noalias(other); } + // Initialize an arbitrary matrix from an object convertible to the Derived type. 
+ template + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE void _init1(const Derived& other){ + this->_set_noalias(other); + } + // Initialize an arbitrary matrix from a generic Eigen expression template EIGEN_DEVICE_FUNC @@ -834,7 +847,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type this->derived() = r; } - // For fixed -size arrays: + // For fixed-size Array template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Scalar& val0, @@ -846,6 +859,7 @@ class PlainObjectBase : public internal::dense_xpr_base::type Base::setConstant(val0); } + // For fixed-size Array template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void _init1(const Index& val0, diff --git a/eigenlib/Eigen/src/Core/Product.h b/eigenlib/Eigen/src/Core/Product.h index ae0c94b3..676c4802 100644 --- a/eigenlib/Eigen/src/Core/Product.h +++ b/eigenlib/Eigen/src/Core/Product.h @@ -97,8 +97,8 @@ class Product : public ProductImpl<_Lhs,_Rhs,Option, && "if you wanted a coeff-wise or a dot product use the respective explicit functions"); } - EIGEN_DEVICE_FUNC inline Index rows() const { return m_lhs.rows(); } - EIGEN_DEVICE_FUNC inline Index cols() const { return m_rhs.cols(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index rows() const { return m_lhs.rows(); } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index cols() const { return m_rhs.cols(); } EIGEN_DEVICE_FUNC const LhsNestedCleaned& lhs() const { return m_lhs; } EIGEN_DEVICE_FUNC const RhsNestedCleaned& rhs() const { return m_rhs; } @@ -127,7 +127,7 @@ public: using Base::derived; typedef typename Base::Scalar Scalar; - operator const Scalar() const + EIGEN_STRONG_INLINE operator const Scalar() const { return internal::evaluator(derived()).coeff(0,0); } @@ -162,7 +162,7 @@ class ProductImpl public: - EIGEN_DEVICE_FUNC Scalar coeff(Index row, Index col) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index row, Index col) const { EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); @@ -170,7 +170,7 @@ class ProductImpl return internal::evaluator(derived()).coeff(row,col); } - EIGEN_DEVICE_FUNC Scalar coeff(Index i) const + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar coeff(Index i) const { EIGEN_STATIC_ASSERT(EnableCoeff, THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS); eigen_assert( (Option==LazyProduct) || (this->rows() == 1 && this->cols() == 1) ); diff --git a/eigenlib/Eigen/src/Core/ProductEvaluators.h b/eigenlib/Eigen/src/Core/ProductEvaluators.h index 583b7f59..bce1310c 100644 --- a/eigenlib/Eigen/src/Core/ProductEvaluators.h +++ b/eigenlib/Eigen/src/Core/ProductEvaluators.h @@ -32,7 +32,7 @@ struct evaluator > typedef Product XprType; typedef product_evaluator Base; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) : Base(xpr) {} + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr) {} }; // Catch "scalar * ( A * B )" and transform it to "(A*scalar) * B" @@ -55,7 +55,7 @@ struct evaluator, const Product > XprType; typedef evaluator > Base; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const XprType& xpr) : Base(xpr.lhs().functor().m_other * xpr.rhs().lhs() * xpr.rhs().rhs()) {} }; @@ -68,7 +68,7 @@ struct evaluator, DiagIndex> > typedef Diagonal, DiagIndex> XprType; typedef evaluator, DiagIndex> > Base; - EIGEN_DEVICE_FUNC explicit evaluator(const XprType& xpr) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit evaluator(const 
XprType& xpr) : Base(Diagonal, DiagIndex>( Product(xpr.nestedExpression().lhs(), xpr.nestedExpression().rhs()), xpr.index() )) @@ -207,6 +207,12 @@ struct evaluator_assume_aliasing +struct evaluator_assume_aliasing::Scalar>, const OtherXpr, + const Product >, DenseShape > { + static const bool value = true; +}; + template struct assignment_from_xpr_op_product { @@ -240,19 +246,19 @@ template struct generic_product_impl { template - static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) = (lhs.transpose().cwiseProduct(rhs)).sum(); } template - static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) += (lhs.transpose().cwiseProduct(rhs)).sum(); } template - static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { dst.coeffRef(0,0) -= (lhs.transpose().cwiseProduct(rhs)).sum(); } }; @@ -306,25 +312,25 @@ struct generic_product_impl }; template - static inline void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, set(), is_row_major()); } template - static inline void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, add(), is_row_major()); } template - static inline void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) + static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { internal::outer_product_selector_run(dst, lhs, rhs, sub(), is_row_major()); } template - static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) + static EIGEN_STRONG_INLINE void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) { internal::outer_product_selector_run(dst, lhs, rhs, adds(alpha), is_row_major()); } @@ -390,7 +396,7 @@ struct generic_product_impl // but easier on the compiler side call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::assign_op()); } - + template static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { @@ -404,6 +410,32 @@ struct generic_product_impl // dst.noalias() -= lhs.lazyProduct(rhs); call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op()); } + + // Catch "dst {,+,-}= (s*A)*B" and evaluate it lazily by moving out the scalar factor: + // dst {,+,-}= s * (A.lazyProduct(B)) + // This is a huge benefit for heap-allocated matrix types as it save one costly allocation. + // For them, this strategy is also faster than simply by-passing the heap allocation through + // stack allocation. + // For fixed sizes matrices, this is less obvious, it is sometimes x2 faster, but sometimes x3 slower, + // and the behavior depends also a lot on the compiler... 
so let's be conservative and enable them for dynamic-size only, + // that is when coming from generic_product_impl<...,GemmProduct> in file GeneralMatrixMatrix.h + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic(Dst& dst, const CwiseBinaryOp, + const CwiseNullaryOp, Plain1>, Xpr2>& lhs, const Rhs& rhs, const Func &func) + { + call_assignment_no_alias(dst, lhs.lhs().functor().m_other * lhs.rhs().lazyProduct(rhs), func); + } + + // Here, we we always have LhsT==Lhs, but we need to make it a template type to make the above + // overload more specialized. + template + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + void eval_dynamic(Dst& dst, const LhsT& lhs, const Rhs& rhs, const Func &func) + { + call_assignment_no_alias(dst, lhs.lazyProduct(rhs), func); + } + // template // static inline void scaleAndAddTo(Dst& dst, const Lhs& lhs, const Rhs& rhs, const Scalar& alpha) @@ -779,7 +811,11 @@ public: _Vectorizable = bool(int(MatrixFlags)&PacketAccessBit) && _SameTypes && (_ScalarAccessOnDiag || (bool(int(DiagFlags)&PacketAccessBit))), _LinearAccessMask = (MatrixType::RowsAtCompileTime==1 || MatrixType::ColsAtCompileTime==1) ? LinearAccessBit : 0, Flags = ((HereditaryBits|_LinearAccessMask) & (unsigned int)(MatrixFlags)) | (_Vectorizable ? PacketAccessBit : 0), - Alignment = evaluator::Alignment + Alignment = evaluator::Alignment, + + AsScalarProduct = (DiagonalType::SizeAtCompileTime==1) + || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::RowsAtCompileTime==1 && ProductOrder==OnTheLeft) + || (DiagonalType::SizeAtCompileTime==Dynamic && MatrixType::ColsAtCompileTime==1 && ProductOrder==OnTheRight) }; diagonal_product_evaluator_base(const MatrixType &mat, const DiagonalType &diag) @@ -791,7 +827,10 @@ public: EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar coeff(Index idx) const { - return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); + if(AsScalarProduct) + return m_diagImpl.coeff(0) * m_matImpl.coeff(idx); + else + return m_diagImpl.coeff(idx) * m_matImpl.coeff(idx); } protected: diff --git a/eigenlib/Eigen/src/Core/Redux.h b/eigenlib/Eigen/src/Core/Redux.h index b6e8f888..760e9f86 100644 --- a/eigenlib/Eigen/src/Core/Redux.h +++ b/eigenlib/Eigen/src/Core/Redux.h @@ -407,7 +407,7 @@ protected: */ template template -typename internal::traits::Scalar +EIGEN_STRONG_INLINE typename internal::traits::Scalar DenseBase::redux(const Func& func) const { eigen_assert(this->rows()>0 && this->cols()>0 && "you are using an empty matrix"); diff --git a/eigenlib/Eigen/src/Core/Ref.h b/eigenlib/Eigen/src/Core/Ref.h index bdf24f52..17a1496b 100644 --- a/eigenlib/Eigen/src/Core/Ref.h +++ b/eigenlib/Eigen/src/Core/Ref.h @@ -28,12 +28,13 @@ struct traits > template struct match { enum { + IsVectorAtCompileTime = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime, HasDirectAccess = internal::has_direct_access::ret, - StorageOrderMatch = PlainObjectType::IsVectorAtCompileTime || Derived::IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), + StorageOrderMatch = IsVectorAtCompileTime || ((PlainObjectType::Flags&RowMajorBit)==(Derived::Flags&RowMajorBit)), InnerStrideMatch = int(StrideType::InnerStrideAtCompileTime)==int(Dynamic) || int(StrideType::InnerStrideAtCompileTime)==int(Derived::InnerStrideAtCompileTime) || (int(StrideType::InnerStrideAtCompileTime)==0 && int(Derived::InnerStrideAtCompileTime)==1), - OuterStrideMatch = Derived::IsVectorAtCompileTime + OuterStrideMatch = IsVectorAtCompileTime || 
int(StrideType::OuterStrideAtCompileTime)==int(Dynamic) || int(StrideType::OuterStrideAtCompileTime)==int(Derived::OuterStrideAtCompileTime), // NOTE, this indirection of evaluator::Alignment is needed // to workaround a very strange bug in MSVC related to the instantiation @@ -95,6 +96,8 @@ protected: template EIGEN_DEVICE_FUNC void construct(Expression& expr) { + EIGEN_STATIC_ASSERT_SAME_MATRIX_SIZE(PlainObjectType,Expression); + if(PlainObjectType::RowsAtCompileTime==1) { eigen_assert(expr.rows()==1 || expr.cols()==1); diff --git a/eigenlib/Eigen/src/Core/SelfAdjointView.h b/eigenlib/Eigen/src/Core/SelfAdjointView.h index 504c98f0..b2e51f37 100644 --- a/eigenlib/Eigen/src/Core/SelfAdjointView.h +++ b/eigenlib/Eigen/src/Core/SelfAdjointView.h @@ -71,7 +71,9 @@ template class SelfAdjointView EIGEN_DEVICE_FUNC explicit inline SelfAdjointView(MatrixType& matrix) : m_matrix(matrix) - {} + { + EIGEN_STATIC_ASSERT(UpLo==Lower || UpLo==Upper,SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY); + } EIGEN_DEVICE_FUNC inline Index rows() const { return m_matrix.rows(); } @@ -189,7 +191,7 @@ template class SelfAdjointView TriangularView >::type(tmp2); } - typedef SelfAdjointView ConjugateReturnType; + typedef SelfAdjointView ConjugateReturnType; /** \sa MatrixBase::conjugate() const */ EIGEN_DEVICE_FUNC inline const ConjugateReturnType conjugate() const diff --git a/eigenlib/Eigen/src/Core/SelfCwiseBinaryOp.h b/eigenlib/Eigen/src/Core/SelfCwiseBinaryOp.h index 719ed72a..7c89c2e2 100644 --- a/eigenlib/Eigen/src/Core/SelfCwiseBinaryOp.h +++ b/eigenlib/Eigen/src/Core/SelfCwiseBinaryOp.h @@ -15,33 +15,29 @@ namespace Eigen { // TODO generalize the scalar type of 'other' template -EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator*=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::mul_assign_op()); return derived(); } template -EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator+=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::add_assign_op()); return derived(); } template -EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& ArrayBase::operator-=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::sub_assign_op()); return derived(); } template -EIGEN_STRONG_INLINE Derived& DenseBase::operator/=(const Scalar& other) +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& DenseBase::operator/=(const Scalar& other) { - typedef typename Derived::PlainObject PlainObject; internal::call_assignment(this->derived(), PlainObject::Constant(rows(),cols(),other), internal::div_assign_op()); return derived(); } diff --git a/eigenlib/Eigen/src/Core/Solve.h b/eigenlib/Eigen/src/Core/Solve.h index 960a5859..a8daea51 100644 --- a/eigenlib/Eigen/src/Core/Solve.h +++ b/eigenlib/Eigen/src/Core/Solve.h @@ -34,12 +34,12 @@ template struct s template struct solve_traits { - typedef Matrix PlainObject; + RhsType::MaxColsAtCompileTime>::type PlainObject; }; template diff --git a/eigenlib/Eigen/src/Core/SolveTriangular.h 
b/eigenlib/Eigen/src/Core/SolveTriangular.h index 049890b2..fd0acb1a 100644 --- a/eigenlib/Eigen/src/Core/SolveTriangular.h +++ b/eigenlib/Eigen/src/Core/SolveTriangular.h @@ -19,7 +19,7 @@ namespace internal { template struct triangular_solve_vector; -template +template struct triangular_solve_matrix; // small helper struct extracting some traits on the underlying solver operation @@ -98,8 +98,8 @@ struct triangular_solver_selector BlockingType blocking(rhs.rows(), rhs.cols(), size, 1, false); triangular_solve_matrix - ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.outerStride(), blocking); + (Rhs::Flags&RowMajorBit) ? RowMajor : ColMajor, Rhs::InnerStrideAtCompileTime> + ::run(size, othersize, &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &rhs.coeffRef(0,0), rhs.innerStride(), rhs.outerStride(), blocking); } }; @@ -169,6 +169,9 @@ void TriangularViewImpl::solveInPlace(const MatrixBase::Flags & RowMajorBit) && OtherDerived::IsVectorAtCompileTime && OtherDerived::SizeAtCompileTime!=1}; typedef typename internal::conditional::stableNorm() const typedef typename internal::nested_eval::type DerivedCopy; typedef typename internal::remove_all::type DerivedCopyClean; - DerivedCopy copy(derived()); + const DerivedCopy copy(derived()); enum { CanAlign = ( (int(DerivedCopyClean::Flags)&DirectAccessBit) || (int(internal::evaluator::Alignment)>0) // FIXME Alignment)>0 might not be enough - ) && (blockSize*sizeof(Scalar)*20) // if we cannot allocate on the stack, then let's not bother about this optimization }; typedef typename internal::conditional, internal::evaluator::Alignment>, typename DerivedCopyClean::ConstSegmentReturnType>::type SegmentWrapper; diff --git a/eigenlib/Eigen/src/Core/Transpose.h b/eigenlib/Eigen/src/Core/Transpose.h index 79b767bc..960dc451 100644 --- a/eigenlib/Eigen/src/Core/Transpose.h +++ b/eigenlib/Eigen/src/Core/Transpose.h @@ -146,6 +146,8 @@ template class TransposeImpl { return derived().nestedExpression().coeffRef(index); } + protected: + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TransposeImpl) }; /** \returns an expression of the transpose of *this. diff --git a/eigenlib/Eigen/src/Core/Transpositions.h b/eigenlib/Eigen/src/Core/Transpositions.h index 19c17bb4..7718625e 100644 --- a/eigenlib/Eigen/src/Core/Transpositions.h +++ b/eigenlib/Eigen/src/Core/Transpositions.h @@ -33,17 +33,6 @@ class TranspositionsBase indices() = other.indices(); return derived(); } - - #ifndef EIGEN_PARSED_BY_DOXYGEN - /** This is a special case of the templated operator=. Its purpose is to - * prevent a default operator= from hiding the templated operator=. - */ - Derived& operator=(const TranspositionsBase& other) - { - indices() = other.indices(); - return derived(); - } - #endif /** \returns the number of transpositions */ Index size() const { return indices().size(); } @@ -171,12 +160,6 @@ class Transpositions : public TranspositionsBase& other) : m_indices(other.indices()) {} - #ifndef EIGEN_PARSED_BY_DOXYGEN - /** Standard copy constructor. Defined only to prevent a default copy constructor - * from hiding the other templated constructor */ - inline Transpositions(const Transpositions& other) : m_indices(other.indices()) {} - #endif - /** Generic constructor from expression of the transposition indices. 
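// Usage sketch for the indices-expression constructor documented above,
// assuming the Eigen 3.3 API; the index values and sizes are illustrative.
#include <Eigen/Core>

void transpositions_demo() {
  Eigen::Vector3i idx(1, 2, 2);           // entry i encodes the swap (i, idx[i])
  Eigen::Transpositions<3> tr(idx);       // applies (0,1), then (1,2), then (2,2)
  Eigen::Matrix3d A = Eigen::Matrix3d::Identity();
  Eigen::Matrix3d B = tr * A;             // rows of A permuted by the swaps
}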
*/ template explicit inline Transpositions(const MatrixBase& indices) : m_indices(indices) @@ -189,17 +172,6 @@ class Transpositions : public TranspositionsBase > const Product operator*(const MatrixBase& matrix, const Transpose& trt) { - return Product(matrix.derived(), trt.derived()); + return Product(matrix.derived(), trt); } /** \returns the \a matrix with the inverse transpositions applied to the rows. diff --git a/eigenlib/Eigen/src/Core/TriangularMatrix.h b/eigenlib/Eigen/src/Core/TriangularMatrix.h index 667ef09d..9abb7e31 100644 --- a/eigenlib/Eigen/src/Core/TriangularMatrix.h +++ b/eigenlib/Eigen/src/Core/TriangularMatrix.h @@ -217,9 +217,7 @@ template class TriangularView explicit inline TriangularView(MatrixType& matrix) : m_matrix(matrix) {} - using Base::operator=; - TriangularView& operator=(const TriangularView &other) - { return Base::operator=(other); } + EIGEN_INHERIT_ASSIGNMENT_OPERATORS(TriangularView) /** \copydoc EigenBase::rows() */ EIGEN_DEVICE_FUNC @@ -544,6 +542,10 @@ template class TriangularViewImpl<_Mat template EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TriangularViewType& _assignProduct(const ProductType& prod, const Scalar& alpha, bool beta); + protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(TriangularViewImpl) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(TriangularViewImpl) + }; /*************************************************************************** diff --git a/eigenlib/Eigen/src/Core/arch/AVX/Complex.h b/eigenlib/Eigen/src/Core/arch/AVX/Complex.h index 99439c8a..7fa61969 100644 --- a/eigenlib/Eigen/src/Core/arch/AVX/Complex.h +++ b/eigenlib/Eigen/src/Core/arch/AVX/Complex.h @@ -204,23 +204,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet8f& x, const Packet4cf& y, const Packet4cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet8f& x, const Packet4cf& y) const - { return Packet4cf(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet4cf pmadd(const Packet4cf& x, const Packet8f& y, const Packet4cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet4cf pmul(const Packet4cf& x, const Packet8f& y) const - { return Packet4cf(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet4cf,Packet8f) template<> EIGEN_STRONG_INLINE Packet4cf pdiv(const Packet4cf& a, const Packet4cf& b) { @@ -400,23 +384,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet4d& x, const Packet2cd& y, const Packet2cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet4d& x, const Packet2cd& y) const - { return Packet2cd(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cd pmadd(const Packet2cd& x, const Packet4d& y, const Packet2cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cd pmul(const Packet2cd& x, const Packet4d& y) const - { return Packet2cd(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cd,Packet4d) template<> EIGEN_STRONG_INLINE Packet2cd pdiv(const Packet2cd& a, const Packet2cd& b) { diff --git a/eigenlib/Eigen/src/Core/arch/AVX/PacketMath.h b/eigenlib/Eigen/src/Core/arch/AVX/PacketMath.h index 195d40fb..923a124b 100644 --- a/eigenlib/Eigen/src/Core/arch/AVX/PacketMath.h +++ b/eigenlib/Eigen/src/Core/arch/AVX/PacketMath.h @@ -159,11 +159,12 @@ 
template<> EIGEN_STRONG_INLINE Packet8i pdiv(const Packet8i& /*a*/, co #ifdef __FMA__ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& b, const Packet8f& c) { -#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) - // clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, - // and gcc stupidly generates a vfmadd132ps instruction, - // so let's enforce it to generate a vfmadd231ps instruction since the most common use case is to accumulate - // the result of the product. +#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) + // Clang stupidly generates a vfmadd213ps instruction plus some vmovaps on registers, + // and even register spilling with clang>=6.0 (bug 1637). + // Gcc stupidly generates a vfmadd132ps instruction. + // So let's enforce it to generate a vfmadd231ps instruction since the most common use + // case is to accumulate the result of the product. Packet8f res = c; __asm__("vfmadd231ps %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); return res; @@ -172,7 +173,7 @@ template<> EIGEN_STRONG_INLINE Packet8f pmadd(const Packet8f& a, const Packet8f& #endif } template<> EIGEN_STRONG_INLINE Packet4d pmadd(const Packet4d& a, const Packet4d& b, const Packet4d& c) { -#if ( EIGEN_COMP_GNUC_STRICT || (EIGEN_COMP_CLANG && (EIGEN_COMP_CLANG<308)) ) +#if ( (EIGEN_COMP_GNUC_STRICT && EIGEN_COMP_GNUC<80) || (EIGEN_COMP_CLANG) ) // see above Packet4d res = c; __asm__("vfmadd231pd %[a], %[b], %[c]" : [c] "+x" (res) : [a] "x" (a), [b] "x" (b)); @@ -308,9 +309,9 @@ template<> EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) } #ifndef EIGEN_VECTORIZE_AVX512 -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } #endif template<> EIGEN_STRONG_INLINE float pfirst(const Packet8f& a) { @@ -333,9 +334,12 @@ template<> EIGEN_STRONG_INLINE Packet4d preverse(const Packet4d& a) { __m256d tmp = _mm256_shuffle_pd(a,a,5); return _mm256_permute2f128_pd(tmp, tmp, 1); - + #if 0 + // This version is unlikely to be faster as _mm256_shuffle_ps and _mm256_permute_pd + // exhibit the same latency/throughput, but it is here for future reference/benchmarking... 
__m256d swap_halves = _mm256_permute2f128_pd(a,a,1); return _mm256_permute_pd(swap_halves,5); + #endif } // pabs should be ok diff --git a/eigenlib/Eigen/src/Core/arch/AVX512/MathFunctions.h b/eigenlib/Eigen/src/Core/arch/AVX512/MathFunctions.h index 399be0ee..b259c1e1 100644 --- a/eigenlib/Eigen/src/Core/arch/AVX512/MathFunctions.h +++ b/eigenlib/Eigen/src/Core/arch/AVX512/MathFunctions.h @@ -29,6 +29,7 @@ namespace internal { #define _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(NAME, X) \ const Packet8d p8d_##NAME = _mm512_castsi512_pd(_mm512_set1_epi64(X)) + // Natural logarithm // Computes log(x) as log(2^e * m) = C*e + log(m), where the constant C =log(2) // and m is in the range [sqrt(1/2),sqrt(2)). In this range, the logarithm can @@ -47,6 +48,7 @@ plog(const Packet16f& _x) { // The smallest non denormalized float number. _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(min_norm_pos, 0x00800000); _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(minus_inf, 0xff800000); + _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(pos_inf, 0x7f800000); _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(nan, 0x7fc00000); // Polynomial coefficients. @@ -64,11 +66,9 @@ plog(const Packet16f& _x) { _EIGEN_DECLARE_CONST_Packet16f(cephes_log_q2, 0.693359375f); // invalid_mask is set to true when x is NaN - __mmask16 invalid_mask = - _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ); - __mmask16 iszero_mask = - _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_UQ); - + __mmask16 invalid_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_NGE_UQ); + __mmask16 iszero_mask = _mm512_cmp_ps_mask(x, _mm512_setzero_ps(), _CMP_EQ_OQ); + // Truncate input values to the minimum positive normal. x = pmax(x, p16f_min_norm_pos); @@ -88,9 +88,9 @@ plog(const Packet16f& _x) { // x = x + x - 1.0; // } else { x = x - 1.0; } __mmask16 mask = _mm512_cmp_ps_mask(x, p16f_cephes_SQRTHF, _CMP_LT_OQ); - Packet16f tmp = _mm512_mask_blend_ps(mask, x, _mm512_setzero_ps()); + Packet16f tmp = _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), x); x = psub(x, p16f_1); - e = psub(e, _mm512_mask_blend_ps(mask, p16f_1, _mm512_setzero_ps())); + e = psub(e, _mm512_mask_blend_ps(mask, _mm512_setzero_ps(), p16f_1)); x = padd(x, tmp); Packet16f x2 = pmul(x, x); @@ -118,10 +118,18 @@ plog(const Packet16f& _x) { x = padd(x, y); x = padd(x, y2); - // Filter out invalid inputs, i.e. negative arg will be NAN, 0 will be -INF. - return _mm512_mask_blend_ps(iszero_mask, p16f_minus_inf, - _mm512_mask_blend_ps(invalid_mask, p16f_nan, x)); + __mmask16 pos_inf_mask = _mm512_cmp_ps_mask(_x,p16f_pos_inf,_CMP_EQ_OQ); + // Filter out invalid inputs, i.e.: + // - negative arg will be NAN, + // - 0 will be -INF. + // - +INF will be +INF + return _mm512_mask_blend_ps(iszero_mask, + _mm512_mask_blend_ps(invalid_mask, + _mm512_mask_blend_ps(pos_inf_mask,x,p16f_pos_inf), + p16f_nan), + p16f_minus_inf); } + #endif // Exponential function. 
Works by writing "x = m*log(2) + r" where @@ -257,50 +265,39 @@ pexp(const Packet8d& _x) { template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet16f psqrt(const Packet16f& _x) { - _EIGEN_DECLARE_CONST_Packet16f(one_point_five, 1.5f); - _EIGEN_DECLARE_CONST_Packet16f(minus_half, -0.5f); - _EIGEN_DECLARE_CONST_Packet16f_FROM_INT(flt_min, 0x00800000); + Packet16f neg_half = pmul(_x, pset1(-.5f)); + __mmask16 denormal_mask = _mm512_kand( + _mm512_cmp_ps_mask(_x, pset1((std::numeric_limits::min)()), + _CMP_LT_OQ), + _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_GE_OQ)); - Packet16f neg_half = pmul(_x, p16f_minus_half); - - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask16 non_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_GE_OQ); - Packet16f x = _mm512_mask_blend_ps(non_zero_mask, _mm512_rsqrt14_ps(_x), - _mm512_setzero_ps()); + Packet16f x = _mm512_rsqrt14_ps(_x); // Do a single step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5f))); - // Multiply the original _x by it's reciprocal square root to extract the - // square root. - return pmul(_x, x); + // Flush results for denormals to zero. + return _mm512_mask_blend_ps(denormal_mask, pmul(_x,x), _mm512_setzero_ps()); } template <> EIGEN_DEFINE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS EIGEN_UNUSED Packet8d psqrt(const Packet8d& _x) { - _EIGEN_DECLARE_CONST_Packet8d(one_point_five, 1.5); - _EIGEN_DECLARE_CONST_Packet8d(minus_half, -0.5); - _EIGEN_DECLARE_CONST_Packet8d_FROM_INT64(dbl_min, 0x0010000000000000LL); + Packet8d neg_half = pmul(_x, pset1(-.5)); + __mmask16 denormal_mask = _mm512_kand( + _mm512_cmp_pd_mask(_x, pset1((std::numeric_limits::min)()), + _CMP_LT_OQ), + _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_GE_OQ)); - Packet8d neg_half = pmul(_x, p8d_minus_half); + Packet8d x = _mm512_rsqrt14_pd(_x); - // select only the inverse sqrt of positive normal inputs (denormals are - // flushed to zero and cause infs as well). - __mmask8 non_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_GE_OQ); - Packet8d x = _mm512_mask_blend_pd(non_zero_mask, _mm512_rsqrt14_pd(_x), - _mm512_setzero_pd()); - - // Do a first step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + // Do a single step of Newton's iteration. + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5))); // Do a second step of Newton's iteration. - x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); + x = pmul(x, pmadd(neg_half, pmul(x, x), pset1(1.5))); - // Multiply the original _x by it's reciprocal square root to extract the - // square root. - return pmul(_x, x); + return _mm512_mask_blend_pd(denormal_mask, pmul(_x,x), _mm512_setzero_pd()); } #else template <> @@ -333,20 +330,18 @@ prsqrt(const Packet16f& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). __mmask16 le_zero_mask = _mm512_cmp_ps_mask(_x, p16f_flt_min, _CMP_LT_OQ); - Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), - _mm512_rsqrt14_ps(_x)); + Packet16f x = _mm512_mask_blend_ps(le_zero_mask, _mm512_rsqrt14_ps(_x), _mm512_setzero_ps()); // Fill in NaNs and Infs for the negative/zero entries. 
__mmask16 neg_mask = _mm512_cmp_ps_mask(_x, _mm512_setzero_ps(), _CMP_LT_OQ); Packet16f infs_and_nans = _mm512_mask_blend_ps( - neg_mask, p16f_nan, - _mm512_mask_blend_ps(le_zero_mask, p16f_inf, _mm512_setzero_ps())); + neg_mask, _mm512_mask_blend_ps(le_zero_mask, _mm512_setzero_ps(), p16f_inf), p16f_nan); // Do a single step of Newton's iteration. x = pmul(x, pmadd(neg_half, pmul(x, x), p16f_one_point_five)); // Insert NaNs and Infs in all the right places. - return _mm512_mask_blend_ps(le_zero_mask, infs_and_nans, x); + return _mm512_mask_blend_ps(le_zero_mask, x, infs_and_nans); } template <> @@ -363,14 +358,12 @@ prsqrt(const Packet8d& _x) { // select only the inverse sqrt of positive normal inputs (denormals are // flushed to zero and cause infs as well). __mmask8 le_zero_mask = _mm512_cmp_pd_mask(_x, p8d_dbl_min, _CMP_LT_OQ); - Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), - _mm512_rsqrt14_pd(_x)); + Packet8d x = _mm512_mask_blend_pd(le_zero_mask, _mm512_rsqrt14_pd(_x), _mm512_setzero_pd()); // Fill in NaNs and Infs for the negative/zero entries. __mmask8 neg_mask = _mm512_cmp_pd_mask(_x, _mm512_setzero_pd(), _CMP_LT_OQ); Packet8d infs_and_nans = _mm512_mask_blend_pd( - neg_mask, p8d_nan, - _mm512_mask_blend_pd(le_zero_mask, p8d_inf, _mm512_setzero_pd())); + neg_mask, _mm512_mask_blend_pd(le_zero_mask, _mm512_setzero_pd(), p8d_inf), p8d_nan); // Do a first step of Newton's iteration. x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); @@ -379,9 +372,9 @@ prsqrt(const Packet8d& _x) { x = pmul(x, pmadd(neg_half, pmul(x, x), p8d_one_point_five)); // Insert NaNs and Infs in all the right places. - return _mm512_mask_blend_pd(le_zero_mask, infs_and_nans, x); + return _mm512_mask_blend_pd(le_zero_mask, x, infs_and_nans); } -#else +#elif defined(EIGEN_VECTORIZE_AVX512ER) template <> EIGEN_STRONG_INLINE Packet16f prsqrt(const Packet16f& x) { return _mm512_rsqrt28_ps(x); diff --git a/eigenlib/Eigen/src/Core/arch/AVX512/PacketMath.h b/eigenlib/Eigen/src/Core/arch/AVX512/PacketMath.h index f6500a16..000b7762 100644 --- a/eigenlib/Eigen/src/Core/arch/AVX512/PacketMath.h +++ b/eigenlib/Eigen/src/Core/arch/AVX512/PacketMath.h @@ -19,10 +19,10 @@ namespace internal { #endif #ifndef EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS -#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS (2*sizeof(void*)) +#define EIGEN_ARCH_DEFAULT_NUMBER_OF_REGISTERS 32 #endif -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA #ifndef EIGEN_HAS_SINGLE_INSTRUCTION_MADD #define EIGEN_HAS_SINGLE_INSTRUCTION_MADD #endif @@ -54,13 +54,14 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 16, HasHalfPacket = 1, -#if EIGEN_GNUC_AT_LEAST(5, 3) + HasBlend = 0, +#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) #ifdef EIGEN_VECTORIZE_AVX512DQ HasLog = 1, #endif HasExp = 1, - HasSqrt = 1, - HasRsqrt = 1, + HasSqrt = EIGEN_FAST_MATH, + HasRsqrt = EIGEN_FAST_MATH, #endif HasDiv = 1 }; @@ -74,8 +75,8 @@ template<> struct packet_traits : default_packet_traits AlignedOnScalar = 1, size = 8, HasHalfPacket = 1, -#if EIGEN_GNUC_AT_LEAST(5, 3) - HasSqrt = 1, +#if EIGEN_GNUC_AT_LEAST(5, 3) || (!EIGEN_COMP_GNUC_STRICT) + HasSqrt = EIGEN_FAST_MATH, HasRsqrt = EIGEN_FAST_MATH, #endif HasDiv = 1 @@ -98,6 +99,7 @@ template <> struct unpacket_traits { typedef float type; typedef Packet8f half; + typedef Packet16i integer_packet; enum { size = 16, alignment=Aligned64 }; }; template <> @@ -132,7 +134,7 @@ EIGEN_STRONG_INLINE Packet16f pload1(const float* from) { } template <> 
EIGEN_STRONG_INLINE Packet8d pload1(const double* from) { - return _mm512_broadcastsd_pd(_mm_load_pd1(from)); + return _mm512_set1_pd(*from); } template <> @@ -158,6 +160,11 @@ EIGEN_STRONG_INLINE Packet8d padd(const Packet8d& a, const Packet8d& b) { return _mm512_add_pd(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16i padd(const Packet16i& a, + const Packet16i& b) { + return _mm512_add_epi32(a, b); +} template <> EIGEN_STRONG_INLINE Packet16f psub(const Packet16f& a, @@ -169,6 +176,11 @@ EIGEN_STRONG_INLINE Packet8d psub(const Packet8d& a, const Packet8d& b) { return _mm512_sub_pd(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16i psub(const Packet16i& a, + const Packet16i& b) { + return _mm512_sub_epi32(a, b); +} template <> EIGEN_STRONG_INLINE Packet16f pnegate(const Packet16f& a) { @@ -202,6 +214,11 @@ EIGEN_STRONG_INLINE Packet8d pmul(const Packet8d& a, const Packet8d& b) { return _mm512_mul_pd(a, b); } +template <> +EIGEN_STRONG_INLINE Packet16i pmul(const Packet16i& a, + const Packet16i& b) { + return _mm512_mul_epi32(a, b); +} template <> EIGEN_STRONG_INLINE Packet16f pdiv(const Packet16f& a, @@ -214,7 +231,7 @@ EIGEN_STRONG_INLINE Packet8d pdiv(const Packet8d& a, return _mm512_div_pd(a, b); } -#ifdef __FMA__ +#ifdef EIGEN_VECTORIZE_FMA template <> EIGEN_STRONG_INLINE Packet16f pmadd(const Packet16f& a, const Packet16f& b, const Packet16f& c) { @@ -230,23 +247,73 @@ EIGEN_STRONG_INLINE Packet8d pmadd(const Packet8d& a, const Packet8d& b, template <> EIGEN_STRONG_INLINE Packet16f pmin(const Packet16f& a, const Packet16f& b) { - return _mm512_min_ps(a, b); + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm512_min_ps(b, a); } template <> EIGEN_STRONG_INLINE Packet8d pmin(const Packet8d& a, const Packet8d& b) { - return _mm512_min_pd(a, b); + // Arguments are reversed to match NaN propagation behavior of std::min. + return _mm512_min_pd(b, a); } template <> EIGEN_STRONG_INLINE Packet16f pmax(const Packet16f& a, const Packet16f& b) { - return _mm512_max_ps(a, b); + // Arguments are reversed to match NaN propagation behavior of std::max. + return _mm512_max_ps(b, a); } template <> EIGEN_STRONG_INLINE Packet8d pmax(const Packet8d& a, const Packet8d& b) { - return _mm512_max_pd(a, b); + // Arguments are reversed to match NaN propagation behavior of std::max. + return _mm512_max_pd(b, a); +} + +#ifdef EIGEN_VECTORIZE_AVX512DQ +template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { return _mm512_extractf32x8_ps(x,I_); } +template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { return _mm512_extractf64x2_pd(x,I_); } +EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { return _mm512_insertf32x8(_mm512_castps256_ps512(a),b,1); } +#else +// AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 +template EIGEN_STRONG_INLINE Packet8f extract256(Packet16f x) { + return _mm256_castsi256_ps(_mm512_extracti64x4_epi64( _mm512_castps_si512(x),I_)); +} + +// AVX512F does not define _mm512_extractf64x2_pd to extract _m128 from _m512 +template EIGEN_STRONG_INLINE Packet2d extract128(Packet8d x) { + return _mm_castsi128_pd(_mm512_extracti32x4_epi32( _mm512_castpd_si512(x),I_)); +} + +EIGEN_STRONG_INLINE Packet16f cat256(Packet8f a, Packet8f b) { + return _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castsi256_si512(_mm256_castps_si256(a)), + _mm256_castps_si256(b),1)); +} +#endif + +// Helper function for bit packing snippet of low precision comparison. +// It packs the flags from 32x16 to 16x16. 
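// Why the pmin/pmax overloads above swap their operands (a scalar sketch):
// the x86 min/max instructions return the second source operand when either
// input is NaN, while std::min(a,b) returns 'a' whenever the comparison is
// false, NaN included. Calling _mm512_min_ps(b, a) therefore yields 'a' in
// the NaN case, matching the scalar rule below:
static inline float pmin_scalar_semantics(float a, float b) {
  return b < a ? b : a;  // any comparison with NaN is false, so 'a' is returned
}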
+EIGEN_STRONG_INLINE __m256i Pack32To16(Packet16f rf) { + // Split data into small pieces and handle with AVX instructions + // to guarantee internal order of vector. + // Operation: + // dst[15:0] := Saturate16(rf[31:0]) + // dst[31:16] := Saturate16(rf[63:32]) + // ... + // dst[255:240] := Saturate16(rf[255:224]) + __m256i lo = _mm256_castps_si256(extract256<0>(rf)); + __m256i hi = _mm256_castps_si256(extract256<1>(rf)); + __m128i result_lo = _mm_packs_epi32(_mm256_extractf128_si256(lo, 0), + _mm256_extractf128_si256(lo, 1)); + __m128i result_hi = _mm_packs_epi32(_mm256_extractf128_si256(hi, 0), + _mm256_extractf128_si256(hi, 1)); + return _mm256_insertf128_si256(_mm256_castsi128_si256(result_lo), result_hi, 1); +} + +template <> +EIGEN_STRONG_INLINE Packet16i pand(const Packet16i& a, + const Packet16i& b) { + return _mm512_and_si512(a,b); } template <> @@ -255,24 +322,7 @@ EIGEN_STRONG_INLINE Packet16f pand(const Packet16f& a, #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_and_ps(a, b); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_and_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_and_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_and_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_and_ps(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(pand(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } template <> @@ -288,35 +338,21 @@ EIGEN_STRONG_INLINE Packet8d pand(const Packet8d& a, Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_insertf64x4(res, _mm256_and_pd(lane1_a, lane1_b), 1); #endif } + template <> -EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16i por(const Packet16i& a, const Packet16i& b) { + return _mm512_or_si512(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16f por(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_or_ps(a, b); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_or_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_or_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_or_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_or_ps(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(por(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } @@ -326,109 +362,67 @@ EIGEN_STRONG_INLINE Packet8d por(const Packet8d& a, #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_or_pd(a, b); #else - Packet8d res = _mm512_undefined_pd(); - 
Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_or_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_or_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_castsi512_pd(por(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16i pxor(const Packet16i& a, const Packet16i& b) { + return _mm512_xor_si512(a, b); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pxor(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_xor_ps(a, b); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_xor_ps(lane3_a, lane3_b), 3); - - return res; + return _mm512_castsi512_ps(pxor(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } + template <> -EIGEN_STRONG_INLINE Packet8d pxor(const Packet8d& a, - const Packet8d& b) { +EIGEN_STRONG_INLINE Packet8d pxor(const Packet8d& a, const Packet8d& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ return _mm512_xor_pd(a, b); #else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_xor_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_xor_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_castsi512_pd(pxor(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet16f pandnot(const Packet16f& a, - const Packet16f& b) { +EIGEN_STRONG_INLINE Packet16i pandnot(const Packet16i& a, const Packet16i& b) { + return _mm512_andnot_si512(b, a); +} + +template <> +EIGEN_STRONG_INLINE Packet16f pandnot(const Packet16f& a, const Packet16f& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_andnot_ps(a, b); + return _mm512_andnot_ps(b, a); #else - Packet16f res = _mm512_undefined_ps(); - Packet4f lane0_a = _mm512_extractf32x4_ps(a, 0); - Packet4f lane0_b = _mm512_extractf32x4_ps(b, 0); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane0_a, lane0_b), 0); - - Packet4f lane1_a = _mm512_extractf32x4_ps(a, 1); - Packet4f lane1_b = _mm512_extractf32x4_ps(b, 1); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane1_a, lane1_b), 1); - - Packet4f lane2_a = _mm512_extractf32x4_ps(a, 2); - Packet4f lane2_b = _mm512_extractf32x4_ps(b, 2); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane2_a, lane2_b), 2); - - Packet4f lane3_a = _mm512_extractf32x4_ps(a, 3); - Packet4f lane3_b = _mm512_extractf32x4_ps(b, 3); - res = _mm512_insertf32x4(res, _mm_andnot_ps(lane3_a, lane3_b), 3); - - return res; + return 
_mm512_castsi512_ps(pandnot(_mm512_castps_si512(a),_mm512_castps_si512(b))); #endif } template <> -EIGEN_STRONG_INLINE Packet8d pandnot(const Packet8d& a, - const Packet8d& b) { +EIGEN_STRONG_INLINE Packet8d pandnot(const Packet8d& a,const Packet8d& b) { #ifdef EIGEN_VECTORIZE_AVX512DQ - return _mm512_andnot_pd(a, b); + return _mm512_andnot_pd(b, a); #else - Packet8d res = _mm512_undefined_pd(); - Packet4d lane0_a = _mm512_extractf64x4_pd(a, 0); - Packet4d lane0_b = _mm512_extractf64x4_pd(b, 0); - res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane0_a, lane0_b), 0); - - Packet4d lane1_a = _mm512_extractf64x4_pd(a, 1); - Packet4d lane1_b = _mm512_extractf64x4_pd(b, 1); - res = _mm512_insertf64x4(res, _mm256_andnot_pd(lane1_a, lane1_b), 1); - - return res; + return _mm512_castsi512_pd(pandnot(_mm512_castpd_si512(a),_mm512_castpd_si512(b))); #endif } +template EIGEN_STRONG_INLINE Packet16i parithmetic_shift_right(Packet16i a) { + return _mm512_srai_epi32(a, N); +} + +template EIGEN_STRONG_INLINE Packet16i plogical_shift_right(Packet16i a) { + return _mm512_srli_epi32(a, N); +} + +template EIGEN_STRONG_INLINE Packet16i plogical_shift_left(Packet16i a) { + return _mm512_slli_epi32(a, N); +} + template <> EIGEN_STRONG_INLINE Packet16f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return _mm512_load_ps(from); @@ -461,75 +455,55 @@ EIGEN_STRONG_INLINE Packet16i ploadu(const int* from) { // {a0, a0 a1, a1, a2, a2, a3, a3, a4, a4, a5, a5, a6, a6, a7, a7} template <> EIGEN_STRONG_INLINE Packet16f ploaddup(const float* from) { - Packet8f lane0 = _mm256_broadcast_ps((const __m128*)(const void*)from); - // mimic an "inplace" permutation of the lower 128bits using a blend - lane0 = _mm256_blend_ps( - lane0, _mm256_castps128_ps256(_mm_permute_ps( - _mm256_castps256_ps128(lane0), _MM_SHUFFLE(1, 0, 1, 0))), - 15); - // then we can perform a consistent permutation on the global register to get - // everything in shape: - lane0 = _mm256_permute_ps(lane0, _MM_SHUFFLE(3, 3, 2, 2)); - - Packet8f lane1 = _mm256_broadcast_ps((const __m128*)(const void*)(from + 4)); - // mimic an "inplace" permutation of the lower 128bits using a blend - lane1 = _mm256_blend_ps( - lane1, _mm256_castps128_ps256(_mm_permute_ps( - _mm256_castps256_ps128(lane1), _MM_SHUFFLE(1, 0, 1, 0))), - 15); - // then we can perform a consistent permutation on the global register to get - // everything in shape: - lane1 = _mm256_permute_ps(lane1, _MM_SHUFFLE(3, 3, 2, 2)); + // an unaligned load is required here as there is no requirement + // on the alignment of input pointer 'from' + __m256i low_half = _mm256_loadu_si256(reinterpret_cast(from)); + __m512 even_elements = _mm512_castsi512_ps(_mm512_cvtepu32_epi64(low_half)); + __m512 pairs = _mm512_permute_ps(even_elements, _MM_SHUFFLE(2, 2, 0, 0)); + return pairs; +} #ifdef EIGEN_VECTORIZE_AVX512DQ - Packet16f res = _mm512_undefined_ps(); - return _mm512_insertf32x8(res, lane0, 0); - return _mm512_insertf32x8(res, lane1, 1); - return res; -#else - Packet16f res = _mm512_undefined_ps(); - res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 0), 0); - res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane0, 1), 1); - res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 0), 2); - res = _mm512_insertf32x4(res, _mm256_extractf128_ps(lane1, 1), 3); - return res; -#endif -} +// FIXME: this does not look optimal, better load a Packet4d and shuffle... 
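// Reference semantics for the ploaddup overloads here (a sketch of the
// contract the intrinsic sequences implement, with illustrative names):
// every source element is duplicated into two adjacent destination lanes.
template <typename Scalar, int N>
static inline void ploaddup_reference(const Scalar* from, Scalar (&to)[N]) {
  for (int i = 0; i < N / 2; ++i) {
    to[2 * i + 0] = from[i];  // e.g. {a0, a0, a1, a1, ...}
    to[2 * i + 1] = from[i];
  }
}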
// Loads 4 doubles from memory and returns the packet {a0, a0, a1, a1, a2, a2, a3, // a3} template <> EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) { - Packet4d lane0 = _mm256_broadcast_pd((const __m128d*)(const void*)from); - lane0 = _mm256_permute_pd(lane0, 3 << 2); - - Packet4d lane1 = _mm256_broadcast_pd((const __m128d*)(const void*)(from + 2)); - lane1 = _mm256_permute_pd(lane1, 3 << 2); - - Packet8d res = _mm512_undefined_pd(); - res = _mm512_insertf64x4(res, lane0, 0); - return _mm512_insertf64x4(res, lane1, 1); + __m512d x = _mm512_setzero_pd(); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[0]), 0); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[1]), 1); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[2]), 2); + x = _mm512_insertf64x2(x, _mm_loaddup_pd(&from[3]), 3); + return x; } +#else +template <> +EIGEN_STRONG_INLINE Packet8d ploaddup<Packet8d>(const double* from) { + __m512d x = _mm512_setzero_pd(); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<0, _mm_load_sd(from+0)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<2, _mm_load_sd(from+1)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<4, _mm_load_sd(from+2)); + x = _mm512_mask_broadcastsd_pd(x, 0x3<<6, _mm_load_sd(from+3)); + return x; } +#endif // Loads 4 floats from memory and returns the packet // {a0, a0, a0, a0, a1, a1, a1, a1, a2, a2, a2, a2, a3, a3, a3, a3} template <> EIGEN_STRONG_INLINE Packet16f ploadquad<Packet16f>(const float* from) { - Packet16f tmp = _mm512_undefined_ps(); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from), 0); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 1), 1); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 2), 2); - tmp = _mm512_insertf32x4(tmp, _mm_load_ps1(from + 3), 3); - return tmp; + Packet16f tmp = _mm512_castps128_ps512(ploadu<Packet4f>(from)); + const Packet16i scatter_mask = _mm512_set_epi32(3,3,3,3, 2,2,2,2, 1,1,1,1, 0,0,0,0); + return _mm512_permutexvar_ps(scatter_mask, tmp); } + // Loads 2 doubles from memory and returns the packet // {a0, a0, a0, a0, a1, a1, a1, a1} template <> EIGEN_STRONG_INLINE Packet8d ploadquad<Packet8d>(const double* from) { - Packet8d tmp = _mm512_undefined_pd(); - Packet2d tmp0 = _mm_load_pd1(from); - Packet2d tmp1 = _mm_load_pd1(from + 1); - Packet4d lane0 = _mm256_broadcastsd_pd(tmp0); - Packet4d lane1 = _mm256_broadcastsd_pd(tmp1); + __m256d lane0 = _mm256_set1_pd(*from); + __m256d lane1 = _mm256_set1_pd(*(from+1)); + __m512d tmp = _mm512_undefined_pd(); tmp = _mm512_insertf64x4(tmp, lane0, 0); return _mm512_insertf64x4(tmp, lane1, 1); } @@ -565,7 +539,7 @@ EIGEN_STRONG_INLINE void pstoreu<int>(int* to, const Packet16i& from) { template <> EIGEN_DEVICE_FUNC inline Packet16f pgather<float, Packet16f>(const float* from, Index stride) { - Packet16i stride_vector = _mm512_set1_epi32(stride); + Packet16i stride_vector = _mm512_set1_epi32(convert_index<int>(stride)); Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); @@ -575,7 +549,7 @@ template <> EIGEN_DEVICE_FUNC inline Packet8d pgather<double, Packet8d>(const double* from, Index stride) { - Packet8i stride_vector = _mm256_set1_epi32(stride); + Packet8i stride_vector = _mm256_set1_epi32(convert_index<int>(stride)); Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); @@ -586,7 +560,7 @@ template <> EIGEN_DEVICE_FUNC inline void pscatter<float, Packet16f>(float* to, const Packet16f& from, Index stride) { - Packet16i stride_vector =
_mm512_set1_epi32(stride); + Packet16i stride_vector = _mm512_set1_epi32(convert_index(stride)); Packet16i stride_multiplier = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); Packet16i indices = _mm512_mullo_epi32(stride_vector, stride_multiplier); @@ -596,7 +570,7 @@ template <> EIGEN_DEVICE_FUNC inline void pscatter(double* to, const Packet8d& from, Index stride) { - Packet8i stride_vector = _mm256_set1_epi32(stride); + Packet8i stride_vector = _mm256_set1_epi32(convert_index(stride)); Packet8i stride_multiplier = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); Packet8i indices = _mm256_mullo_epi32(stride_vector, stride_multiplier); _mm512_i32scatter_pd(to, indices, from, 8); @@ -618,9 +592,9 @@ EIGEN_STRONG_INLINE void pstore1(int* to, const int& a) { pstore(to, pa); } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template <> EIGEN_STRONG_INLINE float pfirst(const Packet16f& a) { @@ -648,20 +622,20 @@ template<> EIGEN_STRONG_INLINE Packet8d preverse(const Packet8d& a) template<> EIGEN_STRONG_INLINE Packet16f pabs(const Packet16f& a) { // _mm512_abs_ps intrinsic not found, so hack around it - return (__m512)_mm512_and_si512((__m512i)a, _mm512_set1_epi32(0x7fffffff)); + return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x7fffffff))); } template <> EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { // _mm512_abs_ps intrinsic not found, so hack around it - return (__m512d)_mm512_and_si512((__m512i)a, - _mm512_set1_epi64(0x7fffffffffffffff)); + return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), + _mm512_set1_epi64(0x7fffffffffffffff))); } #ifdef EIGEN_VECTORIZE_AVX512DQ // AVX512F does not define _mm512_extractf32x8_ps to extract _m256 from _m512 #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ - __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0) __m256 OUTPUT##_1 = \ - _mm512_extractf32x8_ps(INPUT, 1) + __m256 OUTPUT##_0 = _mm512_extractf32x8_ps(INPUT, 0); \ + __m256 OUTPUT##_1 = _mm512_extractf32x8_ps(INPUT, 1) #else #define EIGEN_EXTRACT_8f_FROM_16f(INPUT, OUTPUT) \ __m256 OUTPUT##_0 = _mm256_insertf128_ps( \ @@ -674,17 +648,136 @@ EIGEN_STRONG_INLINE Packet8d pabs(const Packet8d& a) { #ifdef EIGEN_VECTORIZE_AVX512DQ #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ - OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTA, 0); \ - OUTPUT = _mm512_insertf32x8(OUTPUT, INPUTB, 1); + OUTPUT = _mm512_insertf32x8(_mm512_castps256_ps512(INPUTA), INPUTB, 1); #else #define EIGEN_INSERT_8f_INTO_16f(OUTPUT, INPUTA, INPUTB) \ + OUTPUT = _mm512_undefined_ps(); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 0), 0); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTA, 1), 1); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 0), 2); \ OUTPUT = _mm512_insertf32x4(OUTPUT, _mm256_extractf128_ps(INPUTB, 1), 3); #endif 
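// The predux reductions that follow all share one strategy: extract the
// lanes of the 512-bit register and fold them pairwise until one element
// remains. A scalar sketch (illustrative name; assumes n is a power of two,
// at most 16):
static inline float predux_reference(const float* lanes, int n) {
  float buf[16];
  for (int i = 0; i < n; ++i) buf[i] = lanes[i];
  for (int half = n / 2; half >= 1; half /= 2)   // 16 -> 8 -> 4 -> 2 -> 1
    for (int i = 0; i < half; ++i) buf[i] += buf[i + half];
  return buf[0];  // sum of all n inputs
}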
-template<> EIGEN_STRONG_INLINE Packet16f preduxp(const Packet16f* -vecs) + +template <> +EIGEN_STRONG_INLINE float predux(const Packet16f& a) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + __m256 lane0 = _mm512_extractf32x8_ps(a, 0); + __m256 lane1 = _mm512_extractf32x8_ps(a, 1); + Packet8f x = _mm256_add_ps(lane0, lane1); + return predux(x); +#else + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 sum = _mm_add_ps(_mm_add_ps(lane0, lane1), _mm_add_ps(lane2, lane3)); + sum = _mm_hadd_ps(sum, sum); + sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1)); + return _mm_cvtss_f32(sum); +#endif +} +template <> +EIGEN_STRONG_INLINE double predux(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d sum = _mm256_add_pd(lane0, lane1); + __m256d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1)); + return _mm_cvtsd_f64(_mm256_castpd256_pd128(_mm256_hadd_pd(tmp0, tmp0))); +} + +template <> +EIGEN_STRONG_INLINE Packet8f predux_downto4(const Packet16f& a) { +#ifdef EIGEN_VECTORIZE_AVX512DQ + Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); + Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); + return padd(lane0, lane1); +#else + Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); + Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); + Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); + Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); + Packet4f sum0 = padd(lane0, lane2); + Packet4f sum1 = padd(lane1, lane3); + return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1); +#endif +} +template <> +EIGEN_STRONG_INLINE Packet4d predux_downto4(const Packet8d& a) { + Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); + Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); + Packet4d res = padd(lane0, lane1); + return res; +} + +template <> +EIGEN_STRONG_INLINE float predux_mul(const Packet16f& a) { +//#ifdef EIGEN_VECTORIZE_AVX512DQ +#if 0 + Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); + Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); + Packet8f res = pmul(lane0, lane1); + res = pmul(res, _mm256_permute2f128_ps(res, res, 1)); + res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +#else + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 res = pmul(pmul(lane0, lane1), pmul(lane2, lane3)); + res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +#endif +} +template <> +EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d res = pmul(lane0, lane1); + res = pmul(res, _mm256_permute2f128_pd(res, res, 1)); + return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1))); +} + +template <> +EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) { + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3)); + res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(_mm_min_ps(res, 
_mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +} +template <> +EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d res = _mm256_min_pd(lane0, lane1); + res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1)); + return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1))); +} + +template <> +EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) { + __m128 lane0 = _mm512_extractf32x4_ps(a, 0); + __m128 lane1 = _mm512_extractf32x4_ps(a, 1); + __m128 lane2 = _mm512_extractf32x4_ps(a, 2); + __m128 lane3 = _mm512_extractf32x4_ps(a, 3); + __m128 res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3)); + res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); + return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); +} + +template <> +EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { + __m256d lane0 = _mm512_extractf64x4_pd(a, 0); + __m256d lane1 = _mm512_extractf64x4_pd(a, 1); + __m256d res = _mm256_max_pd(lane0, lane1); + res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1)); + return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1))); +} + +template<> EIGEN_STRONG_INLINE Packet16f preduxp(const Packet16f* vecs) { EIGEN_EXTRACT_8f_FROM_16f(vecs[0], vecs0); EIGEN_EXTRACT_8f_FROM_16f(vecs[1], vecs1); @@ -873,174 +966,7 @@ template<> EIGEN_STRONG_INLINE Packet8d preduxp(const Packet8d* vecs) return _mm512_insertf64x4(final_output, final_1, 1); } - -template <> -EIGEN_STRONG_INLINE float predux(const Packet16f& a) { - //#ifdef EIGEN_VECTORIZE_AVX512DQ -#if 0 - Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); - Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); - Packet8f sum = padd(lane0, lane1); - Packet8f tmp0 = _mm256_hadd_ps(sum, _mm256_permute2f128_ps(a, a, 1)); - tmp0 = _mm256_hadd_ps(tmp0, tmp0); - return pfirst(_mm256_hadd_ps(tmp0, tmp0)); -#else - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f sum = padd(padd(lane0, lane1), padd(lane2, lane3)); - sum = _mm_hadd_ps(sum, sum); - sum = _mm_hadd_ps(sum, _mm_permute_ps(sum, 1)); - return pfirst(sum); -#endif -} -template <> -EIGEN_STRONG_INLINE double predux(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d sum = padd(lane0, lane1); - Packet4d tmp0 = _mm256_hadd_pd(sum, _mm256_permute2f128_pd(sum, sum, 1)); - return pfirst(_mm256_hadd_pd(tmp0, tmp0)); -} - -template <> -EIGEN_STRONG_INLINE Packet8f predux_downto4(const Packet16f& a) { -#ifdef EIGEN_VECTORIZE_AVX512DQ - Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); - Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); - return padd(lane0, lane1); -#else - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f sum0 = padd(lane0, lane2); - Packet4f sum1 = padd(lane1, lane3); - return _mm256_insertf128_ps(_mm256_castps128_ps256(sum0), sum1, 1); -#endif -} -template <> -EIGEN_STRONG_INLINE Packet4d predux_downto4(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d res = padd(lane0, lane1); - return res; -} - -template <> -EIGEN_STRONG_INLINE float 
predux_mul(const Packet16f& a) { -//#ifdef EIGEN_VECTORIZE_AVX512DQ -#if 0 - Packet8f lane0 = _mm512_extractf32x8_ps(a, 0); - Packet8f lane1 = _mm512_extractf32x8_ps(a, 1); - Packet8f res = pmul(lane0, lane1); - res = pmul(res, _mm256_permute2f128_ps(res, res, 1)); - res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -#else - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f res = pmul(pmul(lane0, lane1), pmul(lane2, lane3)); - res = pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(pmul(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -#endif -} -template <> -EIGEN_STRONG_INLINE double predux_mul(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d res = pmul(lane0, lane1); - res = pmul(res, _mm256_permute2f128_pd(res, res, 1)); - return pfirst(pmul(res, _mm256_shuffle_pd(res, res, 1))); -} - -template <> -EIGEN_STRONG_INLINE float predux_min(const Packet16f& a) { - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f res = _mm_min_ps(_mm_min_ps(lane0, lane1), _mm_min_ps(lane2, lane3)); - res = _mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(_mm_min_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -} -template <> -EIGEN_STRONG_INLINE double predux_min(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d res = _mm256_min_pd(lane0, lane1); - res = _mm256_min_pd(res, _mm256_permute2f128_pd(res, res, 1)); - return pfirst(_mm256_min_pd(res, _mm256_shuffle_pd(res, res, 1))); -} - -template <> -EIGEN_STRONG_INLINE float predux_max(const Packet16f& a) { - Packet4f lane0 = _mm512_extractf32x4_ps(a, 0); - Packet4f lane1 = _mm512_extractf32x4_ps(a, 1); - Packet4f lane2 = _mm512_extractf32x4_ps(a, 2); - Packet4f lane3 = _mm512_extractf32x4_ps(a, 3); - Packet4f res = _mm_max_ps(_mm_max_ps(lane0, lane1), _mm_max_ps(lane2, lane3)); - res = _mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 3, 2))); - return pfirst(_mm_max_ps(res, _mm_permute_ps(res, _MM_SHUFFLE(0, 0, 0, 1)))); -} -template <> -EIGEN_STRONG_INLINE double predux_max(const Packet8d& a) { - Packet4d lane0 = _mm512_extractf64x4_pd(a, 0); - Packet4d lane1 = _mm512_extractf64x4_pd(a, 1); - Packet4d res = _mm256_max_pd(lane0, lane1); - res = _mm256_max_pd(res, _mm256_permute2f128_pd(res, res, 1)); - return pfirst(_mm256_max_pd(res, _mm256_shuffle_pd(res, res, 1))); -} - -template -struct palign_impl { - static EIGEN_STRONG_INLINE void run(Packet16f& first, - const Packet16f& second) { - if (Offset != 0) { - __m512i first_idx = _mm512_set_epi32( - Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11, - Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6, - Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset); - - __m512i second_idx = - _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4, - Offset - 5, Offset - 6, Offset - 7, Offset - 8, - Offset - 9, Offset - 10, Offset - 11, Offset - 12, - Offset - 13, Offset - 14, Offset - 15, Offset - 16); - - unsigned short mask = 0xFFFF; - mask <<= (16 - 
Offset); - - first = _mm512_permutexvar_ps(first_idx, first); - Packet16f tmp = _mm512_permutexvar_ps(second_idx, second); - first = _mm512_mask_blend_ps(mask, first, tmp); - } - } -}; -template -struct palign_impl { - static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) { - if (Offset != 0) { - __m512i first_idx = _mm512_set_epi32( - 0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0, - Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset); - - __m512i second_idx = _mm512_set_epi32( - 0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0, - Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8); - - unsigned char mask = 0xFF; - mask <<= (8 - Offset); - - first = _mm512_permutexvar_pd(first_idx, first); - Packet8d tmp = _mm512_permutexvar_pd(second_idx, second); - first = _mm512_mask_blend_pd(mask, first, tmp); - } - } -}; + #define PACK_OUTPUT(OUTPUT, INPUT, INDEX, STRIDE) \ @@ -1302,13 +1228,76 @@ EIGEN_STRONG_INLINE Packet16f pblend(const Selector<16>& /*ifPacket*/, return Packet16f(); } template <> -EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& /*ifPacket*/, - const Packet8d& /*thenPacket*/, - const Packet8d& /*elsePacket*/) { - assert(false && "To be implemented"); - return Packet8d(); +EIGEN_STRONG_INLINE Packet8d pblend(const Selector<8>& ifPacket, + const Packet8d& thenPacket, + const Packet8d& elsePacket) { + __mmask8 m = (ifPacket.select[0] ) + | (ifPacket.select[1]<<1) + | (ifPacket.select[2]<<2) + | (ifPacket.select[3]<<3) + | (ifPacket.select[4]<<4) + | (ifPacket.select[5]<<5) + | (ifPacket.select[6]<<6) + | (ifPacket.select[7]<<7); + return _mm512_mask_blend_pd(m, elsePacket, thenPacket); } +template<> EIGEN_STRONG_INLINE Packet16i pcast(const Packet16f& a) { + return _mm512_cvttps_epi32(a); +} + +template<> EIGEN_STRONG_INLINE Packet16f pcast(const Packet16i& a) { + return _mm512_cvtepi32_ps(a); +} + +template +struct palign_impl { + static EIGEN_STRONG_INLINE void run(Packet16f& first, + const Packet16f& second) { + if (Offset != 0) { + __m512i first_idx = _mm512_set_epi32( + Offset + 15, Offset + 14, Offset + 13, Offset + 12, Offset + 11, + Offset + 10, Offset + 9, Offset + 8, Offset + 7, Offset + 6, + Offset + 5, Offset + 4, Offset + 3, Offset + 2, Offset + 1, Offset); + + __m512i second_idx = + _mm512_set_epi32(Offset - 1, Offset - 2, Offset - 3, Offset - 4, + Offset - 5, Offset - 6, Offset - 7, Offset - 8, + Offset - 9, Offset - 10, Offset - 11, Offset - 12, + Offset - 13, Offset - 14, Offset - 15, Offset - 16); + + unsigned short mask = 0xFFFF; + mask <<= (16 - Offset); + + first = _mm512_permutexvar_ps(first_idx, first); + Packet16f tmp = _mm512_permutexvar_ps(second_idx, second); + first = _mm512_mask_blend_ps(mask, first, tmp); + } + } +}; +template +struct palign_impl { + static EIGEN_STRONG_INLINE void run(Packet8d& first, const Packet8d& second) { + if (Offset != 0) { + __m512i first_idx = _mm512_set_epi32( + 0, Offset + 7, 0, Offset + 6, 0, Offset + 5, 0, Offset + 4, 0, + Offset + 3, 0, Offset + 2, 0, Offset + 1, 0, Offset); + + __m512i second_idx = _mm512_set_epi32( + 0, Offset - 1, 0, Offset - 2, 0, Offset - 3, 0, Offset - 4, 0, + Offset - 5, 0, Offset - 6, 0, Offset - 7, 0, Offset - 8); + + unsigned char mask = 0xFF; + mask <<= (8 - Offset); + + first = _mm512_permutexvar_pd(first_idx, first); + Packet8d tmp = _mm512_permutexvar_pd(second_idx, second); + first = _mm512_mask_blend_pd(mask, first, tmp); + } + } +}; + + } // end namespace internal } // end namespace Eigen diff --git 
a/eigenlib/Eigen/src/Core/arch/AltiVec/Complex.h b/eigenlib/Eigen/src/Core/arch/AltiVec/Complex.h index 59367ba2..3e665730 100644 --- a/eigenlib/Eigen/src/Core/arch/AltiVec/Complex.h +++ b/eigenlib/Eigen/src/Core/arch/AltiVec/Complex.h @@ -65,7 +65,7 @@ template<> struct unpacket_traits { typedef std::complex type; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { Packet2cf res; - if((ptrdiff_t(&from) % 16) == 0) + if((std::ptrdiff_t(&from) % 16) == 0) res.v = pload((const float *)&from); else res.v = ploadu((const float *)&from); @@ -224,23 +224,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { @@ -416,23 +400,8 @@ template<> struct conj_helper return pconj(internal::pmul(a, b)); } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { diff --git a/eigenlib/Eigen/src/Core/arch/AltiVec/PacketMath.h b/eigenlib/Eigen/src/Core/arch/AltiVec/PacketMath.h index e7d4f4d8..08a27d15 100755 --- a/eigenlib/Eigen/src/Core/arch/AltiVec/PacketMath.h +++ b/eigenlib/Eigen/src/Core/arch/AltiVec/PacketMath.h @@ -90,7 +90,7 @@ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; #define _EIGEN_MASK_ALIGNMENT 0xfffffff0 #endif -#define _EIGEN_ALIGNED_PTR(x) ((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) +#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) // Handle endianness properly while loading constants // Define global static constants: @@ -103,7 +103,7 @@ static Packet16uc p16uc_PSET32_WODD = vec_sld((Packet16uc) vec_splat((Packet4u static Packet16uc p16uc_PSET32_WEVEN = vec_sld(p16uc_DUPLICATE32_HI, (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; static Packet16uc p16uc_HALF64_0_16 = vec_sld((Packet16uc)p4i_ZERO, vec_splat((Packet16uc) vec_abs(p4i_MINUS16), 3), 8); //{ 0,0,0,0, 0,0,0,0, 16,16,16,16, 16,16,16,16}; #else -static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; +static Packet16uc p16uc_FORWARD = p16uc_REVERSE32; static Packet16uc p16uc_REVERSE64 = { 8,9,10,11, 12,13,14,15, 0,1,2,3, 4,5,6,7 }; static Packet16uc p16uc_PSET32_WODD = 
vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 1), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 3), 8);//{ 0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11 }; static Packet16uc p16uc_PSET32_WEVEN = vec_sld((Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 0), (Packet16uc) vec_splat((Packet4ui)p16uc_FORWARD, 2), 8);//{ 4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15 }; @@ -388,10 +388,28 @@ template<> EIGEN_STRONG_INLINE Packet4i pdiv<Packet4i>(const Packet4i& /*a*/, co template<> EIGEN_STRONG_INLINE Packet4f pmadd(const Packet4f& a, const Packet4f& b, const Packet4f& c) { return vec_madd(a,b,c); } template<> EIGEN_STRONG_INLINE Packet4i pmadd(const Packet4i& a, const Packet4i& b, const Packet4i& c) { return a*b + c; } -template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmin<Packet4f>(const Packet4f& a, const Packet4f& b) +{ + #ifdef __VSX__ + Packet4f ret; + __asm__ ("xvcmpgesp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; + #else + return vec_min(a, b); + #endif +} template<> EIGEN_STRONG_INLINE Packet4i pmin<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_min(a, b); } -template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_max(a, b); } +template<> EIGEN_STRONG_INLINE Packet4f pmax<Packet4f>(const Packet4f& a, const Packet4f& b) +{ + #ifdef __VSX__ + Packet4f ret; + __asm__ ("xvcmpgtsp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; + #else + return vec_max(a, b); + #endif +} template<> EIGEN_STRONG_INLINE Packet4i pmax<Packet4i>(const Packet4i& a, const Packet4i& b) { return vec_max(a, b); } template<> EIGEN_STRONG_INLINE Packet4f pand<Packet4f>(const Packet4f& a, const Packet4f& b) { return vec_and(a, b); } @@ -450,15 +468,15 @@ template<> EIGEN_STRONG_INLINE Packet4f ploadu<Packet4f>(const float* from) template<> EIGEN_STRONG_INLINE Packet4f ploaddup<Packet4f>(const float* from) { Packet4f p; - if((ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from); - else p = ploadu<Packet4f>(from); + if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4f>(from); + else p = ploadu<Packet4f>(from); return vec_perm(p, p, p16uc_DUPLICATE32_HI); } template<> EIGEN_STRONG_INLINE Packet4i ploaddup<Packet4i>(const int* from) { Packet4i p; - if((ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from); - else p = ploadu<Packet4i>(from); + if((std::ptrdiff_t(from) % 16) == 0) p = pload<Packet4i>(from); + else p = ploadu<Packet4i>(from); return vec_perm(p, p, p16uc_DUPLICATE32_HI); } @@ -764,7 +782,7 @@ typedef __vector __bool long Packet2bl; static Packet2l p2l_ONE = { 1, 1 }; static Packet2l p2l_ZERO = reinterpret_cast<Packet2l>(p4i_ZERO); -static Packet2d p2d_ONE = { 1.0, 1.0 }; +static Packet2d p2d_ONE = { 1.0, 1.0 }; static Packet2d p2d_ZERO = reinterpret_cast<Packet2d>(p4f_ZERO); static Packet2d p2d_MZERO = { -0.0, -0.0 }; @@ -910,9 +928,19 @@ template<> EIGEN_STRONG_INLINE Packet2d pdiv<Packet2d>(const Packet2d& a, const // for some weird reasons, it has to be overloaded for packets of integers template<> EIGEN_STRONG_INLINE Packet2d pmadd(const Packet2d& a, const Packet2d& b, const Packet2d& c) { return vec_madd(a, b, c); } -template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_min(a, b); } +template<> EIGEN_STRONG_INLINE Packet2d pmin<Packet2d>(const Packet2d& a, const Packet2d& b) +{ + Packet2d ret; + __asm__ ("xvcmpgedp %x0,%x1,%x2\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; + } -template<> EIGEN_STRONG_INLINE Packet2d pmax<Packet2d>(const Packet2d& a, const Packet2d& b) { return vec_max(a, b); } +template<>
EIGEN_STRONG_INLINE Packet2d pmax(const Packet2d& a, const Packet2d& b) +{ + Packet2d ret; + __asm__ ("xvcmpgtdp %x0,%x2,%x1\n\txxsel %x0,%x1,%x2,%x0" : "=&wa" (ret) : "wa" (a), "wa" (b)); + return ret; +} template<> EIGEN_STRONG_INLINE Packet2d pand(const Packet2d& a, const Packet2d& b) { return vec_and(a, b); } @@ -935,8 +963,8 @@ template<> EIGEN_STRONG_INLINE Packet2d ploadu(const double* from) template<> EIGEN_STRONG_INLINE Packet2d ploaddup(const double* from) { Packet2d p; - if((ptrdiff_t(from) % 16) == 0) p = pload(from); - else p = ploadu(from); + if((std::ptrdiff_t(from) % 16) == 0) p = pload(from); + else p = ploadu(from); return vec_splat_dbl<0>(p); } @@ -969,7 +997,7 @@ template<> EIGEN_STRONG_INLINE Packet2d preduxp(const Packet2d* vecs) Packet2d v[2], sum; v[0] = vecs[0] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[0]), reinterpret_cast(vecs[0]), 8)); v[1] = vecs[1] + reinterpret_cast(vec_sld(reinterpret_cast(vecs[1]), reinterpret_cast(vecs[1]), 8)); - + #ifdef _BIG_ENDIAN sum = reinterpret_cast(vec_sld(reinterpret_cast(v[0]), reinterpret_cast(v[1]), 8)); #else @@ -1022,7 +1050,7 @@ ptranspose(PacketBlock& kernel) { template<> EIGEN_STRONG_INLINE Packet2d pblend(const Selector<2>& ifPacket, const Packet2d& thenPacket, const Packet2d& elsePacket) { Packet2l select = { ifPacket.select[0], ifPacket.select[1] }; - Packet2bl mask = vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p2l_ONE)); + Packet2bl mask = reinterpret_cast( vec_cmpeq(reinterpret_cast(select), reinterpret_cast(p2l_ONE)) ); return vec_sel(elsePacket, thenPacket, mask); } #endif // __VSX__ diff --git a/eigenlib/Eigen/src/Core/arch/CUDA/Half.h b/eigenlib/Eigen/src/Core/arch/CUDA/Half.h index 52892db3..59717b4f 100644 --- a/eigenlib/Eigen/src/Core/arch/CUDA/Half.h +++ b/eigenlib/Eigen/src/Core/arch/CUDA/Half.h @@ -13,7 +13,7 @@ // Redistribution and use in source and binary forms, with or without // modification, are permitted. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -// “AS IS” AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, @@ -29,7 +29,7 @@ // type Eigen::half (inheriting from CUDA's __half struct) with // operator overloads such that it behaves basically as an arithmetic // type. It will be quite slow on CPUs (so it is recommended to stay -// in fp32 for CPUs, except for simple parameter conversions, I/O +// in float32_bits for CPUs, except for simple parameter conversions, I/O // to disk and the likes), but fast on GPUs. @@ -42,6 +42,7 @@ #define EIGEN_EXPLICIT_CAST(tgt_type) operator tgt_type() #endif +#include namespace Eigen { @@ -50,38 +51,45 @@ struct half; namespace half_impl { #if !defined(EIGEN_HAS_CUDA_FP16) - -// Make our own __half definition that is similar to CUDA's. -struct __half { - EIGEN_DEVICE_FUNC __half() {} - explicit EIGEN_DEVICE_FUNC __half(unsigned short raw) : x(raw) {} +// Make our own __half_raw definition that is similar to CUDA's. 
+struct __half_raw { + EIGEN_DEVICE_FUNC __half_raw() : x(0) {} + explicit EIGEN_DEVICE_FUNC __half_raw(unsigned short raw) : x(raw) {} unsigned short x; }; - +#elif defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000 +// In CUDA < 9.0, __half is the equivalent of CUDA 9's __half_raw +typedef __half __half_raw; #endif -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff); -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h); -struct half_base : public __half { +struct half_base : public __half_raw { EIGEN_DEVICE_FUNC half_base() {} - EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half(h) {} - EIGEN_DEVICE_FUNC half_base(const __half& h) : __half(h) {} + EIGEN_DEVICE_FUNC half_base(const half_base& h) : __half_raw(h) {} + EIGEN_DEVICE_FUNC half_base(const __half_raw& h) : __half_raw(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half_base(const __half& h) : __half_raw(*(__half_raw*)&h) {} +#endif }; } // namespace half_impl // Class definition. struct half : public half_impl::half_base { - #if !defined(EIGEN_HAS_CUDA_FP16) - typedef half_impl::__half __half; + #if !defined(EIGEN_HAS_CUDA_FP16) || (defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER < 90000) + typedef half_impl::__half_raw __half_raw; #endif EIGEN_DEVICE_FUNC half() {} - EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} + EIGEN_DEVICE_FUNC half(const __half_raw& h) : half_impl::half_base(h) {} EIGEN_DEVICE_FUNC half(const half& h) : half_impl::half_base(h) {} +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDACC_VER) && EIGEN_CUDACC_VER >= 90000 + EIGEN_DEVICE_FUNC half(const __half& h) : half_impl::half_base(h) {} +#endif explicit EIGEN_DEVICE_FUNC half(bool b) : half_impl::half_base(half_impl::raw_uint16_to_half(b ? 
0x3c00 : 0)) {} @@ -138,71 +146,125 @@ struct half : public half_impl::half_base { } }; +} // end namespace Eigen + +namespace std { +template<> +struct numeric_limits { + static const bool is_specialized = true; + static const bool is_signed = true; + static const bool is_integer = false; + static const bool is_exact = false; + static const bool has_infinity = true; + static const bool has_quiet_NaN = true; + static const bool has_signaling_NaN = true; + static const float_denorm_style has_denorm = denorm_present; + static const bool has_denorm_loss = false; + static const std::float_round_style round_style = std::round_to_nearest; + static const bool is_iec559 = false; + static const bool is_bounded = false; + static const bool is_modulo = false; + static const int digits = 11; + static const int digits10 = 3; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int max_digits10 = 5; // according to http://half.sourceforge.net/structstd_1_1numeric__limits_3_01half__float_1_1half_01_4.html + static const int radix = 2; + static const int min_exponent = -13; + static const int min_exponent10 = -4; + static const int max_exponent = 16; + static const int max_exponent10 = 4; + static const bool traps = true; + static const bool tinyness_before = false; + + static Eigen::half (min)() { return Eigen::half_impl::raw_uint16_to_half(0x400); } + static Eigen::half lowest() { return Eigen::half_impl::raw_uint16_to_half(0xfbff); } + static Eigen::half (max)() { return Eigen::half_impl::raw_uint16_to_half(0x7bff); } + static Eigen::half epsilon() { return Eigen::half_impl::raw_uint16_to_half(0x0800); } + static Eigen::half round_error() { return Eigen::half(0.5); } + static Eigen::half infinity() { return Eigen::half_impl::raw_uint16_to_half(0x7c00); } + static Eigen::half quiet_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half signaling_NaN() { return Eigen::half_impl::raw_uint16_to_half(0x7e00); } + static Eigen::half denorm_min() { return Eigen::half_impl::raw_uint16_to_half(0x1); } +}; + +// If std::numeric_limits is specialized, should also specialize +// std::numeric_limits, std::numeric_limits, and +// std::numeric_limits +// https://stackoverflow.com/a/16519653/ +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +template<> +struct numeric_limits : numeric_limits {}; +} // end namespace std + +namespace Eigen { + namespace half_impl { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 // Intrinsics for native fp16 support. Note that on current hardware, -// these are no faster than fp32 arithmetic (you need to use the half2 +// these are no faster than float32_bits arithmetic (you need to use the half2 // versions to get the ALU speed increased), but you do save the // conversion steps back and forth. 
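// Usage sketch for the std::numeric_limits<Eigen::half> specialization added
// above (illustrative helper name); 65504, bit pattern 0x7bff, is the
// largest finite fp16 value.
static inline bool half_would_overflow_sketch(float f) {
  return f > static_cast<float>((std::numeric_limits<Eigen::half>::max)());
}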
-__device__ half operator + (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half operator + (const half& a, const half& b) { return __hadd(a, b); } -__device__ half operator * (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half operator * (const half& a, const half& b) { return __hmul(a, b); } -__device__ half operator - (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half operator - (const half& a, const half& b) { return __hsub(a, b); } -__device__ half operator / (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half operator / (const half& a, const half& b) { float num = __half2float(a); float denom = __half2float(b); return __float2half(num / denom); } -__device__ half operator - (const half& a) { +EIGEN_STRONG_INLINE __device__ half operator - (const half& a) { return __hneg(a); } -__device__ half& operator += (half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half& operator += (half& a, const half& b) { a = a + b; return a; } -__device__ half& operator *= (half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half& operator *= (half& a, const half& b) { a = a * b; return a; } -__device__ half& operator -= (half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half& operator -= (half& a, const half& b) { a = a - b; return a; } -__device__ half& operator /= (half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ half& operator /= (half& a, const half& b) { a = a / b; return a; } -__device__ bool operator == (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator == (const half& a, const half& b) { return __heq(a, b); } -__device__ bool operator != (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator != (const half& a, const half& b) { return __hne(a, b); } -__device__ bool operator < (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator < (const half& a, const half& b) { return __hlt(a, b); } -__device__ bool operator <= (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator <= (const half& a, const half& b) { return __hle(a, b); } -__device__ bool operator > (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator > (const half& a, const half& b) { return __hgt(a, b); } -__device__ bool operator >= (const half& a, const half& b) { +EIGEN_STRONG_INLINE __device__ bool operator >= (const half& a, const half& b) { return __hge(a, b); } #else // Emulate support for half floats // Definitions for CPUs and older CUDA, mostly working through conversion -// to/from fp32. +// to/from float32_bits. EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator + (const half& a, const half& b) { return half(float(a) + float(b)); @@ -238,10 +300,10 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half& operator /= (half& a, const half& b) return a; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator == (const half& a, const half& b) { - return float(a) == float(b); + return numext::equal_strict(float(a),float(b)); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator != (const half& a, const half& b) { - return float(a) != float(b); + return numext::not_equal_strict(float(a), float(b)); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool operator < (const half& a, const half& b) { return float(a) < float(b); @@ -269,34 +331,35 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half operator / (const half& a, Index b) { // these in hardware. 
If we need more performance on older/other CPUs, they are // also possible to vectorize directly. -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half raw_uint16_to_half(unsigned short x) { - __half h; +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw raw_uint16_to_half(unsigned short x) { + __half_raw h; h.x = x; return h; } -union FP32 { +union float32_bits { unsigned int u; float f; }; -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - return __float2half(ff); +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half_raw float_to_half_rtne(float ff) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 + __half tmp_ff = __float2half(ff); + return *(__half_raw*)&tmp_ff; #elif defined(EIGEN_HAS_FP16_C) - __half h; + __half_raw h; h.x = _cvtss_sh(ff, 0); return h; #else - FP32 f; f.f = ff; + float32_bits f; f.f = ff; - const FP32 f32infty = { 255 << 23 }; - const FP32 f16max = { (127 + 16) << 23 }; - const FP32 denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; + const float32_bits f32infty = { 255 << 23 }; + const float32_bits f16max = { (127 + 16) << 23 }; + const float32_bits denorm_magic = { ((127 - 15) + (23 - 10) + 1) << 23 }; unsigned int sign_mask = 0x80000000u; - __half o; + __half_raw o; o.x = static_cast(0x0u); unsigned int sign = f.u & sign_mask; @@ -335,17 +398,17 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC __half float_to_half_rtne(float ff) { #endif } -EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half h) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC float half_to_float(__half_raw h) { +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 return __half2float(h); #elif defined(EIGEN_HAS_FP16_C) return _cvtsh_ss(h.x); #else - const FP32 magic = { 113 << 23 }; + const float32_bits magic = { 113 << 23 }; const unsigned int shifted_exp = 0x7c00 << 13; // exponent mask after shift - FP32 o; + float32_bits o; o.u = (h.x & 0x7fff) << 13; // exponent/mantissa bits unsigned int exp = shifted_exp & o.u; // just the exponent @@ -370,7 +433,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isinf)(const half& a) { return (a.x & 0x7fff) == 0x7c00; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool (isnan)(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hisnan(a); #else return (a.x & 0x7fff) > 0x7c00; @@ -386,11 +449,15 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half abs(const half& a) { return result; } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half exp(const half& a) { - return half(::expf(float(a))); +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 + return half(hexp(a)); +#else + return half(::expf(float(a))); +#endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log(const half& a) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 - return Eigen::half(::hlog(a)); +#if defined(EIGEN_HAS_CUDA_FP16) && EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 + return half(::hlog(a)); #else return half(::logf(float(a))); #endif @@ -402,7 +469,11 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half log10(const half& a) { return half(::log10f(float(a))); } 
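The decode direction above is the classic magic-constant scheme: shift the fp16 fields into fp32 position, rebias the exponent, then patch up the Inf/NaN and denormal cases. A runnable sketch mirroring half_to_float (the union punning follows the source; memcpy would be the strictly portable spelling):

#include <cstdio>

union float32_bits { unsigned int u; float f; };

// Decode a raw IEEE fp16 bit pattern into a float: shift mantissa/exponent
// into fp32 position, rebias the exponent by (127 - 15), and fix up the
// Inf/NaN and zero/denormal cases.
float fp16_bits_to_float(unsigned short hx) {
  const float32_bits magic = { 113 << 23 };        // 2^-14 as a float
  const unsigned int shifted_exp = 0x7c00 << 13;   // fp16 exponent mask, shifted
  float32_bits o;
  o.u = (hx & 0x7fff) << 13;                       // exponent/mantissa bits
  unsigned int exp = shifted_exp & o.u;            // just the exponent
  o.u += (127 - 15) << 23;                         // exponent rebias
  if (exp == shifted_exp)                          // Inf/NaN?
    o.u += (128 - 16) << 23;                       // push to fp32 Inf/NaN
  else if (exp == 0) {                             // zero/denormal?
    o.u += 1 << 23;                                // extra exponent adjust
    o.f -= magic.f;                                // renormalize
  }
  o.u |= (hx & 0x8000u) << 16;                     // sign bit
  return o.f;
}

int main() {
  printf("%g %g %g\n",
         fp16_bits_to_float(0x3c00),   // 1
         fp16_bits_to_float(0x7bff),   // 65504
         fp16_bits_to_float(0x0400));  // 6.10352e-05
}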
EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half sqrt(const half& a) { - return half(::sqrtf(float(a))); +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 + return half(hsqrt(a)); +#else + return half(::sqrtf(float(a))); +#endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half pow(const half& a, const half& b) { return half(::powf(float(a), float(b))); @@ -420,14 +491,22 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half tanh(const half& a) { return half(::tanhf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half floor(const half& a) { +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 + return half(hfloor(a)); +#else return half(::floorf(float(a))); +#endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half ceil(const half& a) { +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 300 + return half(hceil(a)); +#else return half(::ceilf(float(a))); +#endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hlt(b, a) ? b : a; #else const float f1 = static_cast(a); @@ -436,7 +515,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (min)(const half& a, const half& b) { #endif } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC half (max)(const half& a, const half& b) { -#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if defined(EIGEN_HAS_CUDA_FP16) && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return __hlt(a, b) ? b : a; #else const float f1 = static_cast(a); @@ -477,6 +556,13 @@ template<> struct is_arithmetic { enum { value = true }; }; template<> struct NumTraits : GenericNumTraits { + enum { + IsSigned = true, + IsInteger = false, + IsComplex = false, + RequireInitialization = false + }; + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Eigen::half epsilon() { return half_impl::raw_uint16_to_half(0x0800); } @@ -507,7 +593,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half exph(const Eigen::half& a) { return Eigen::half(::expf(float(a))); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half logh(const Eigen::half& a) { -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +#if EIGEN_CUDACC_VER >= 80000 && defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 530 return Eigen::half(::hlog(a)); #else return Eigen::half(::logf(float(a))); @@ -541,14 +627,18 @@ struct hash { // Add the missing shfl_xor intrinsic -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 +#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 300 __device__ EIGEN_STRONG_INLINE Eigen::half __shfl_xor(Eigen::half var, int laneMask, int width=warpSize) { + #if EIGEN_CUDACC_VER < 90000 return static_cast(__shfl_xor(static_cast(var), laneMask, width)); + #else + return static_cast(__shfl_xor_sync(0xFFFFFFFF, static_cast(var), laneMask, width)); + #endif } #endif -// ldg() has an overload for __half, but we also need one for Eigen::half. -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350 +// ldg() has an overload for __half_raw, but we also need one for Eigen::half. 
+#if defined(EIGEN_CUDA_ARCH) && EIGEN_CUDA_ARCH >= 350 EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) { return Eigen::half_impl::raw_uint16_to_half( __ldg(reinterpret_cast(ptr))); @@ -556,7 +646,7 @@ EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half __ldg(const Eigen::half* ptr) #endif -#if defined(__CUDA_ARCH__) +#if defined(EIGEN_CUDA_ARCH) namespace Eigen { namespace numext { diff --git a/eigenlib/Eigen/src/Core/arch/CUDA/PacketMath.h b/eigenlib/Eigen/src/Core/arch/CUDA/PacketMath.h index ad66399e..4dda6318 100644 --- a/eigenlib/Eigen/src/Core/arch/CUDA/PacketMath.h +++ b/eigenlib/Eigen/src/Core/arch/CUDA/PacketMath.h @@ -291,7 +291,7 @@ template<> EIGEN_DEVICE_FUNC inline double2 pabs(const double2& a) { EIGEN_DEVICE_FUNC inline void ptranspose(PacketBlock& kernel) { - double tmp = kernel.packet[0].y; + float tmp = kernel.packet[0].y; kernel.packet[0].y = kernel.packet[1].x; kernel.packet[1].x = tmp; diff --git a/eigenlib/Eigen/src/Core/arch/CUDA/PacketMathHalf.h b/eigenlib/Eigen/src/Core/arch/CUDA/PacketMathHalf.h index ae54225f..f749c573 100644 --- a/eigenlib/Eigen/src/Core/arch/CUDA/PacketMathHalf.h +++ b/eigenlib/Eigen/src/Core/arch/CUDA/PacketMathHalf.h @@ -99,7 +99,8 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst(const half2& template<> __device__ EIGEN_STRONG_INLINE half2 pabs(const half2& a) { half2 result; - result.x = a.x & 0x7FFF7FFF; + unsigned temp = *(reinterpret_cast(&(a))); + *(reinterpret_cast(&(result))) = temp & 0x7FFF7FFF; return result; } @@ -229,7 +230,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux(const half2& #else float a1 = __low2float(a); float a2 = __high2float(a); - return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2))); + return Eigen::half(__float2half_rn(a1 + a2)); #endif } @@ -263,7 +264,7 @@ template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul(const ha #else float a1 = __low2float(a); float a2 = __high2float(a); - return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2))); + return Eigen::half(__float2half_rn(a1 * a2)); #endif } @@ -275,7 +276,7 @@ template<> __device__ EIGEN_STRONG_INLINE half2 plog1p(const half2& a) { return __floats2half2_rn(r1, r2); } -#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000 && defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 530 +#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530 template<> __device__ EIGEN_STRONG_INLINE half2 plog(const half2& a) { diff --git a/eigenlib/Eigen/src/Core/arch/Default/ConjHelper.h b/eigenlib/Eigen/src/Core/arch/Default/ConjHelper.h new file mode 100644 index 00000000..4cfe34e0 --- /dev/null +++ b/eigenlib/Eigen/src/Core/arch/Default/ConjHelper.h @@ -0,0 +1,29 @@ + +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2017 Gael Guennebaud +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
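The macro defined just below generates, per architecture, the pair of conj_helper specializations for complex-packet-times-real-packet products, replacing hand-written copies of the same boilerplate in each SIMD backend. In scalar terms, the two generated helpers amount to the following (the pmul/pmadd names are reused here purely for illustration):

#include <cassert>
#include <complex>

// Scalar analogue: multiplying a complex value by a real one needs no
// conjugation logic, so pmul just scales both components...
std::complex<float> pmul(float x, const std::complex<float>& y) {
  return std::complex<float>(x * y.real(), x * y.imag());
}
// ...and pmadd is the fused multiply-accumulate the generated helper provides.
std::complex<float> pmadd(float x, const std::complex<float>& y,
                          const std::complex<float>& c) {
  return c + pmul(x, y);
}

int main() {
  std::complex<float> y(1.f, 2.f), c(0.5f, -0.5f);
  assert(pmadd(3.f, y, c) == std::complex<float>(3.5f, 5.5f));
}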
+ +#ifndef EIGEN_ARCH_CONJ_HELPER_H +#define EIGEN_ARCH_CONJ_HELPER_H + +#define EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(PACKET_CPLX, PACKET_REAL) \ + template<> struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_REAL& x, const PACKET_CPLX& y, const PACKET_CPLX& c) const \ + { return padd(c, pmul(x,y)); } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_REAL& x, const PACKET_CPLX& y) const \ + { return PACKET_CPLX(Eigen::internal::pmul(x, y.v)); } \ + }; \ + \ + template<> struct conj_helper { \ + EIGEN_STRONG_INLINE PACKET_CPLX pmadd(const PACKET_CPLX& x, const PACKET_REAL& y, const PACKET_CPLX& c) const \ + { return padd(c, pmul(x,y)); } \ + EIGEN_STRONG_INLINE PACKET_CPLX pmul(const PACKET_CPLX& x, const PACKET_REAL& y) const \ + { return PACKET_CPLX(Eigen::internal::pmul(x.v, y)); } \ + }; + +#endif // EIGEN_ARCH_CONJ_HELPER_H diff --git a/eigenlib/Eigen/src/Core/arch/NEON/Complex.h b/eigenlib/Eigen/src/Core/arch/NEON/Complex.h index 57e9b431..306a309b 100644 --- a/eigenlib/Eigen/src/Core/arch/NEON/Complex.h +++ b/eigenlib/Eigen/src/Core/arch/NEON/Complex.h @@ -67,7 +67,7 @@ template<> struct unpacket_traits { typedef std::complex type; template<> EIGEN_STRONG_INLINE Packet2cf pset1(const std::complex& from) { float32x2_t r64; - r64 = vld1_f32((float *)&from); + r64 = vld1_f32((const float *)&from); return Packet2cf(vcombine_f32(r64, r64)); } @@ -142,7 +142,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf to[stride*1] = std::complex(vgetq_lane_f32(from.v, 2), vgetq_lane_f32(from.v, 3)); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((float *)addr); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const float *)addr); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { @@ -265,6 +265,8 @@ template<> struct conj_helper } }; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) + template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { // TODO optimize it for NEON @@ -275,7 +277,7 @@ template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, con s = vmulq_f32(b.v, b.v); rev_s = vrev64q_f32(s); - return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); + return Packet2cf(pdiv(res.v, vaddq_f32(s,rev_s))); } EIGEN_DEVICE_FUNC inline void @@ -381,7 +383,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, from.v); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, from.v); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((double *)addr); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { EIGEN_ARM_PREFETCH((const double *)addr); } template<> EIGEN_DEVICE_FUNC inline Packet1cd pgather, Packet1cd>(const std::complex* from, Index stride) { @@ -456,6 +458,8 @@ template<> struct conj_helper } }; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) + template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for NEON diff --git a/eigenlib/Eigen/src/Core/arch/NEON/PacketMath.h b/eigenlib/Eigen/src/Core/arch/NEON/PacketMath.h index d392bf3f..3d5ed0d2 100644 --- a/eigenlib/Eigen/src/Core/arch/NEON/PacketMath.h +++ 
b/eigenlib/Eigen/src/Core/arch/NEON/PacketMath.h @@ -36,29 +36,63 @@ namespace internal { #endif #endif +#if EIGEN_COMP_MSVC + +// In MSVC's arm_neon.h header file, all NEON vector types +// are aliases to the same underlying type __n128. +// We thus have to wrap them to make them different C++ types. +// (See also bug 1428) + +template +struct eigen_packet_wrapper +{ + operator T&() { return m_val; } + operator const T&() const { return m_val; } + eigen_packet_wrapper() {} + eigen_packet_wrapper(const T &v) : m_val(v) {} + eigen_packet_wrapper& operator=(const T &v) { + m_val = v; + return *this; + } + + T m_val; +}; +typedef eigen_packet_wrapper Packet2f; +typedef eigen_packet_wrapper Packet4f; +typedef eigen_packet_wrapper Packet4i; +typedef eigen_packet_wrapper Packet2i; +typedef eigen_packet_wrapper Packet4ui; + +#else + typedef float32x2_t Packet2f; typedef float32x4_t Packet4f; typedef int32x4_t Packet4i; typedef int32x2_t Packet2i; typedef uint32x4_t Packet4ui; +#endif // EIGEN_COMP_MSVC + #define _EIGEN_DECLARE_CONST_Packet4f(NAME,X) \ const Packet4f p4f_##NAME = pset1(X) #define _EIGEN_DECLARE_CONST_Packet4f_FROM_INT(NAME,X) \ - const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1(X)) + const Packet4f p4f_##NAME = vreinterpretq_f32_u32(pset1(X)) #define _EIGEN_DECLARE_CONST_Packet4i(NAME,X) \ const Packet4i p4i_##NAME = pset1(X) -// arm64 does have the pld instruction. If available, let's trust the __builtin_prefetch built-in function -// which available on LLVM and GCC (at least) -#if EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC +#if EIGEN_ARCH_ARM64 + // __builtin_prefetch tends to do nothing on ARM64 compilers because the + // prefetch instructions there are too detailed for __builtin_prefetch to map + // meaningfully to them. 
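Hence the reworked cascade that follows: an explicit prfm pldl1keep on ARM64, __builtin_prefetch where the compiler provides it, __pld or an inline pld on 32-bit ARM, and a no-op fallback. A compilable sketch of the same selection (MY_PREFETCH is an illustrative name):

// Prefetching is only a hint, so every branch is allowed to do nothing.
#if defined(__aarch64__)
#  define MY_PREFETCH(addr) \
     __asm__ __volatile__("prfm pldl1keep, [%[p]]\n" ::[p] "r"(addr) :)
#elif defined(__GNUC__) || defined(__clang__)
#  define MY_PREFETCH(addr) __builtin_prefetch(addr)
#else
#  define MY_PREFETCH(addr) /* no-op */
#endif

int main() {
  static int data[64];
  MY_PREFETCH(&data[0]);   // pulls the line toward L1; no observable effect
  return data[0];
}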
+ #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__("prfm pldl1keep, [%[addr]]\n" ::[addr] "r"(ADDR) : ); +#elif EIGEN_HAS_BUILTIN(__builtin_prefetch) || EIGEN_COMP_GNUC #define EIGEN_ARM_PREFETCH(ADDR) __builtin_prefetch(ADDR); #elif defined __pld #define EIGEN_ARM_PREFETCH(ADDR) __pld(ADDR) -#elif !EIGEN_ARCH_ARM64 - #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ( " pld [%[addr]]\n" :: [addr] "r" (ADDR) : "cc" ); +#elif EIGEN_ARCH_ARM32 + #define EIGEN_ARM_PREFETCH(ADDR) __asm__ __volatile__ ("pld [%[addr]]\n" :: [addr] "r" (ADDR) : ); #else // by default no explicit prefetching #define EIGEN_ARM_PREFETCH(ADDR) @@ -83,7 +117,7 @@ template<> struct packet_traits : default_packet_traits HasSqrt = 0 }; }; -template<> struct packet_traits : default_packet_traits +template<> struct packet_traits : default_packet_traits { typedef Packet4i type; typedef Packet4i half; // Packet2i intrinsics not implemented yet @@ -105,19 +139,19 @@ EIGEN_STRONG_INLINE void vst1q_f32(float* to, float32x4_t from) { ::vst1q EIGEN_STRONG_INLINE void vst1_f32 (float* to, float32x2_t from) { ::vst1_f32 ((float32_t*)to,from); } #endif -template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; -template<> struct unpacket_traits { typedef int type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; +template<> struct unpacket_traits { typedef float type; enum {size=4, alignment=Aligned16}; typedef Packet4f half; }; +template<> struct unpacket_traits { typedef int32_t type; enum {size=4, alignment=Aligned16}; typedef Packet4i half; }; template<> EIGEN_STRONG_INLINE Packet4f pset1(const float& from) { return vdupq_n_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pset1(const int& from) { return vdupq_n_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pset1(const int32_t& from) { return vdupq_n_s32(from); } template<> EIGEN_STRONG_INLINE Packet4f plset(const float& a) { - const float32_t f[] = {0, 1, 2, 3}; + const float f[] = {0, 1, 2, 3}; Packet4f countdown = vld1q_f32(f); return vaddq_f32(pset1(a), countdown); } -template<> EIGEN_STRONG_INLINE Packet4i plset(const int& a) +template<> EIGEN_STRONG_INLINE Packet4i plset(const int32_t& a) { const int32_t i[] = {0, 1, 2, 3}; Packet4i countdown = vld1q_s32(i); @@ -240,20 +274,20 @@ template<> EIGEN_STRONG_INLINE Packet4f pandnot(const Packet4f& a, con } template<> EIGEN_STRONG_INLINE Packet4i pandnot(const Packet4i& a, const Packet4i& b) { return vbicq_s32(a,b); } -template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i pload(const int* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f pload(const float* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i pload(const int32_t* from) { EIGEN_DEBUG_ALIGNED_LOAD return vld1q_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } -template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } +template<> EIGEN_STRONG_INLINE Packet4f ploadu(const float* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_f32(from); } +template<> EIGEN_STRONG_INLINE Packet4i ploadu(const int32_t* from) { EIGEN_DEBUG_UNALIGNED_LOAD return vld1q_s32(from); } -template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) 
+template<> EIGEN_STRONG_INLINE Packet4f ploaddup(const float* from) { float32x2_t lo, hi; lo = vld1_dup_f32(from); hi = vld1_dup_f32(from+1); return vcombine_f32(lo, hi); } -template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) +template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int32_t* from) { int32x2_t lo, hi; lo = vld1_dup_s32(from); @@ -261,11 +295,11 @@ template<> EIGEN_STRONG_INLINE Packet4i ploaddup(const int* from) return vcombine_s32(lo, hi); } -template<> EIGEN_STRONG_INLINE void pstore(float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstore(int* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_STRONG_INLINE void pstore (float* to, const Packet4f& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void pstore(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_ALIGNED_STORE vst1q_s32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } -template<> EIGEN_STRONG_INLINE void pstoreu(int* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu (float* to, const Packet4f& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_f32(to, from); } +template<> EIGEN_STRONG_INLINE void pstoreu(int32_t* to, const Packet4i& from) { EIGEN_DEBUG_UNALIGNED_STORE vst1q_s32(to, from); } template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const float* from, Index stride) { @@ -276,7 +310,7 @@ template<> EIGEN_DEVICE_FUNC inline Packet4f pgather(const floa res = vsetq_lane_f32(from[3*stride], res, 3); return res; } -template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int* from, Index stride) +template<> EIGEN_DEVICE_FUNC inline Packet4i pgather(const int32_t* from, Index stride) { Packet4i res = pset1(0); res = vsetq_lane_s32(from[0*stride], res, 0); @@ -293,7 +327,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(float* to, co to[stride*2] = vgetq_lane_f32(from, 2); to[stride*3] = vgetq_lane_f32(from, 3); } -template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const Packet4i& from, Index stride) +template<> EIGEN_DEVICE_FUNC inline void pscatter(int32_t* to, const Packet4i& from, Index stride) { to[stride*0] = vgetq_lane_s32(from, 0); to[stride*1] = vgetq_lane_s32(from, 1); @@ -301,12 +335,12 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter(int* to, const to[stride*3] = vgetq_lane_s32(from, 3); } -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { EIGEN_ARM_PREFETCH(addr); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch (const float* addr) { EIGEN_ARM_PREFETCH(addr); } +template<> EIGEN_STRONG_INLINE void prefetch(const int32_t* addr) { EIGEN_ARM_PREFETCH(addr); } // FIXME only store the 2 first elements ? 
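The integer packet API above is retyped from int to int32_t so the scalar type matches the int32x4_t lanes exactly on every platform. ploaddup reads two scalars and duplicates each into adjacent lanes; its semantics in scalar form (ploaddup4 is an illustrative name):

#include <cassert>
#include <cstdint>

// Semantics of ploaddup: {a, b} in memory becomes the packet {a, a, b, b},
// built on NEON with two vld1_dup loads and a vcombine.
void ploaddup4(const int32_t* from, int32_t out[4]) {
  out[0] = out[1] = from[0];
  out[2] = out[3] = from[1];
}

int main() {
  int32_t in[2] = {7, 9}, out[4];
  ploaddup4(in, out);
  assert(out[0] == 7 && out[1] == 7 && out[2] == 9 && out[3] == 9);
}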
-template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } -template<> EIGEN_STRONG_INLINE int pfirst(const Packet4i& a) { int EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE float pfirst(const Packet4f& a) { float EIGEN_ALIGN16 x[4]; vst1q_f32(x, a); return x[0]; } +template<> EIGEN_STRONG_INLINE int32_t pfirst(const Packet4i& a) { int32_t EIGEN_ALIGN16 x[4]; vst1q_s32(x, a); return x[0]; } template<> EIGEN_STRONG_INLINE Packet4f preverse(const Packet4f& a) { float32x2_t a_lo, a_hi; @@ -361,7 +395,7 @@ template<> EIGEN_STRONG_INLINE Packet4f preduxp(const Packet4f* vecs) return sum; } -template<> EIGEN_STRONG_INLINE int predux(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux(const Packet4i& a) { int32x2_t a_lo, a_hi, sum; @@ -408,7 +442,7 @@ template<> EIGEN_STRONG_INLINE float predux_mul(const Packet4f& a) return vget_lane_f32(prod, 0); } -template<> EIGEN_STRONG_INLINE int predux_mul(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux_mul(const Packet4i& a) { int32x2_t a_lo, a_hi, prod; @@ -436,7 +470,7 @@ template<> EIGEN_STRONG_INLINE float predux_min(const Packet4f& a) return vget_lane_f32(min, 0); } -template<> EIGEN_STRONG_INLINE int predux_min(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux_min(const Packet4i& a) { int32x2_t a_lo, a_hi, min; @@ -461,7 +495,7 @@ template<> EIGEN_STRONG_INLINE float predux_max(const Packet4f& a) return vget_lane_f32(max, 0); } -template<> EIGEN_STRONG_INLINE int predux_max(const Packet4i& a) +template<> EIGEN_STRONG_INLINE int32_t predux_max(const Packet4i& a) { int32x2_t a_lo, a_hi, max; diff --git a/eigenlib/Eigen/src/Core/arch/SSE/Complex.h b/eigenlib/Eigen/src/Core/arch/SSE/Complex.h index 5607fe0a..d075043c 100644 --- a/eigenlib/Eigen/src/Core/arch/SSE/Complex.h +++ b/eigenlib/Eigen/src/Core/arch/SSE/Complex.h @@ -128,7 +128,7 @@ template<> EIGEN_DEVICE_FUNC inline void pscatter, Packet2cf _mm_cvtss_f32(_mm_shuffle_ps(from.v, from.v, 3))); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet2cf& a) { @@ -229,23 +229,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet4f& x, const Packet2cf& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet4f& x, const Packet2cf& y) const - { return Packet2cf(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet2cf pmadd(const Packet2cf& x, const Packet4f& y, const Packet2cf& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet2cf pmul(const Packet2cf& x, const Packet4f& y) const - { return Packet2cf(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) template<> EIGEN_STRONG_INLINE Packet2cf pdiv(const Packet2cf& a, const Packet2cf& b) { @@ -340,7 +324,7 @@ template<> EIGEN_STRONG_INLINE Packet1cd ploaddup(const std::complex< template<> EIGEN_STRONG_INLINE void pstore >(std::complex * to, const Packet1cd& from) { EIGEN_DEBUG_ALIGNED_STORE pstore((double*)to, Packet2d(from.v)); } template<> EIGEN_STRONG_INLINE void pstoreu >(std::complex * to, const 
Packet1cd& from) { EIGEN_DEBUG_UNALIGNED_STORE pstoreu((double*)to, Packet2d(from.v)); } -template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch >(const std::complex * addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } template<> EIGEN_STRONG_INLINE std::complex pfirst(const Packet1cd& a) { @@ -430,23 +414,7 @@ template<> struct conj_helper } }; -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet2d& x, const Packet1cd& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet2d& x, const Packet1cd& y) const - { return Packet1cd(Eigen::internal::pmul(x, y.v)); } -}; - -template<> struct conj_helper -{ - EIGEN_STRONG_INLINE Packet1cd pmadd(const Packet1cd& x, const Packet2d& y, const Packet1cd& c) const - { return padd(c, pmul(x,y)); } - - EIGEN_STRONG_INLINE Packet1cd pmul(const Packet1cd& x, const Packet2d& y) const - { return Packet1cd(Eigen::internal::pmul(x.v, y)); } -}; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { diff --git a/eigenlib/Eigen/src/Core/arch/SSE/PacketMath.h b/eigenlib/Eigen/src/Core/arch/SSE/PacketMath.h index 3832de14..60e2517e 100755 --- a/eigenlib/Eigen/src/Core/arch/SSE/PacketMath.h +++ b/eigenlib/Eigen/src/Core/arch/SSE/PacketMath.h @@ -28,7 +28,7 @@ namespace internal { #endif #endif -#if (defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004) +#if ((defined EIGEN_VECTORIZE_AVX) && (EIGEN_COMP_GNUC_STRICT || EIGEN_COMP_MINGW) && (__GXX_ABI_VERSION < 1004)) || EIGEN_OS_QNX // With GCC's default ABI version, a __m128 or __m256 are the same types and therefore we cannot // have overloads for both types without linking error. // One solution is to increase ABI version using -fabi-version=4 (or greater). @@ -409,10 +409,16 @@ template<> EIGEN_STRONG_INLINE void pstore1(double* to, const double& pstore(to, Packet2d(vec2d_swizzle1(pa,0,0))); } +#if EIGEN_COMP_PGI +typedef const void * SsePrefetchPtrType; +#else +typedef const char * SsePrefetchPtrType; +#endif + #ifndef EIGEN_VECTORIZE_AVX -template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } -template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((const char*)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const float* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const double* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } +template<> EIGEN_STRONG_INLINE void prefetch(const int* addr) { _mm_prefetch((SsePrefetchPtrType)(addr), _MM_HINT_T0); } #endif #if EIGEN_COMP_MSVC_STRICT && EIGEN_OS_WIN64 @@ -876,4 +882,14 @@ template<> EIGEN_STRONG_INLINE double pmadd(const double& a, const double& b, co } // end namespace Eigen +#if EIGEN_COMP_PGI +// PGI++ does not define the following intrinsics in C++ mode. 
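The shims that follow supply the _mm_cast* intrinsics PGI's C++ front end lacks; a cast of this kind only relabels the register's static type and never converts the value. The strictly portable analogue of such a bit-preserving view is memcpy, as in this sketch (bits_of is an illustrative helper):

#include <cassert>
#include <cstdint>
#include <cstring>

// Reinterpret the 32 bits of a float without converting the value,
// which is all the _mm_cast* family does at the register level.
uint32_t bits_of(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof u);   // defined behavior, compiles to a move
  return u;
}

int main() {
  assert(bits_of(1.0f)  == 0x3f800000u);  // sign 0, exponent 127, mantissa 0
  assert(bits_of(-2.0f) == 0xc0000000u);  // sign 1, exponent 128, mantissa 0
}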
+static inline __m128 _mm_castpd_ps (__m128d x) { return reinterpret_cast<__m128&>(x); } +static inline __m128i _mm_castpd_si128(__m128d x) { return reinterpret_cast<__m128i&>(x); } +static inline __m128d _mm_castps_pd (__m128 x) { return reinterpret_cast<__m128d&>(x); } +static inline __m128i _mm_castps_si128(__m128 x) { return reinterpret_cast<__m128i&>(x); } +static inline __m128 _mm_castsi128_ps(__m128i x) { return reinterpret_cast<__m128&>(x); } +static inline __m128d _mm_castsi128_pd(__m128i x) { return reinterpret_cast<__m128d&>(x); } +#endif + #endif // EIGEN_PACKET_MATH_SSE_H diff --git a/eigenlib/Eigen/src/Core/arch/SSE/TypeCasting.h b/eigenlib/Eigen/src/Core/arch/SSE/TypeCasting.h index c8489323..c6ca8c71 100644 --- a/eigenlib/Eigen/src/Core/arch/SSE/TypeCasting.h +++ b/eigenlib/Eigen/src/Core/arch/SSE/TypeCasting.h @@ -14,6 +14,7 @@ namespace Eigen { namespace internal { +#ifndef EIGEN_VECTORIZE_AVX template <> struct type_casting_traits { enum { @@ -23,11 +24,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { - return _mm_cvttps_epi32(a); -} - - template <> struct type_casting_traits { enum { @@ -37,11 +33,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { - return _mm_cvtepi32_ps(a); -} - - template <> struct type_casting_traits { enum { @@ -51,10 +42,6 @@ struct type_casting_traits { }; }; -template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { - return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6)); -} - template <> struct type_casting_traits { enum { @@ -63,6 +50,19 @@ struct type_casting_traits { TgtCoeffRatio = 2 }; }; +#endif + +template<> EIGEN_STRONG_INLINE Packet4i pcast(const Packet4f& a) { + return _mm_cvttps_epi32(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet4i& a) { + return _mm_cvtepi32_ps(a); +} + +template<> EIGEN_STRONG_INLINE Packet4f pcast(const Packet2d& a, const Packet2d& b) { + return _mm_shuffle_ps(_mm_cvtpd_ps(a), _mm_cvtpd_ps(b), (1 << 2) | (1 << 6)); +} template<> EIGEN_STRONG_INLINE Packet2d pcast(const Packet4f& a) { // Simply discard the second half of the input diff --git a/eigenlib/Eigen/src/Core/arch/ZVector/Complex.h b/eigenlib/Eigen/src/Core/arch/ZVector/Complex.h index d39d2d10..1bfb7339 100644 --- a/eigenlib/Eigen/src/Core/arch/ZVector/Complex.h +++ b/eigenlib/Eigen/src/Core/arch/ZVector/Complex.h @@ -336,6 +336,9 @@ template<> struct conj_helper } }; +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet2cf,Packet4f) +EIGEN_MAKE_CONJ_HELPER_CPLX_REAL(Packet1cd,Packet2d) + template<> EIGEN_STRONG_INLINE Packet1cd pdiv(const Packet1cd& a, const Packet1cd& b) { // TODO optimize it for AltiVec diff --git a/eigenlib/Eigen/src/Core/arch/ZVector/PacketMath.h b/eigenlib/Eigen/src/Core/arch/ZVector/PacketMath.h index e2deb25c..57b01fc6 100755 --- a/eigenlib/Eigen/src/Core/arch/ZVector/PacketMath.h +++ b/eigenlib/Eigen/src/Core/arch/ZVector/PacketMath.h @@ -100,7 +100,7 @@ static Packet16uc p16uc_DUPLICATE32_HI = { 0,1,2,3, 0,1,2,3, 4,5,6,7, 4,5,6,7 }; // Mask alignment #define _EIGEN_MASK_ALIGNMENT 0xfffffffffffffff0 -#define _EIGEN_ALIGNED_PTR(x) ((ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) +#define _EIGEN_ALIGNED_PTR(x) ((std::ptrdiff_t)(x) & _EIGEN_MASK_ALIGNMENT) // Handle endianness properly while loading constants // Define global static constants: diff --git a/eigenlib/Eigen/src/Core/functors/AssignmentFunctors.h b/eigenlib/Eigen/src/Core/functors/AssignmentFunctors.h 
index 9b373c78..4153b877 100644 --- a/eigenlib/Eigen/src/Core/functors/AssignmentFunctors.h +++ b/eigenlib/Eigen/src/Core/functors/AssignmentFunctors.h @@ -28,7 +28,7 @@ template struct assign_op { { internal::pstoret(a,b); } }; -// Empty overload for void type (used by PermutationMatrix +// Empty overload for void type (used by PermutationMatrix) template struct assign_op {}; template diff --git a/eigenlib/Eigen/src/Core/functors/BinaryFunctors.h b/eigenlib/Eigen/src/Core/functors/BinaryFunctors.h index 96747bac..3eae6b8c 100644 --- a/eigenlib/Eigen/src/Core/functors/BinaryFunctors.h +++ b/eigenlib/Eigen/src/Core/functors/BinaryFunctors.h @@ -255,7 +255,7 @@ struct scalar_cmp_op : binary_op_base struct scalar_hypot_op : binary_op_base { EIGEN_EMPTY_STRUCT_CTOR(scalar_hypot_op) -// typedef typename NumTraits::Real result_type; - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& _x, const Scalar& _y) const + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar &x, const Scalar &y) const { - EIGEN_USING_STD_MATH(sqrt) - Scalar p, qp; - if(_x>_y) - { - p = _x; - qp = _y / p; - } - else - { - p = _y; - qp = _x / p; - } - return p * sqrt(Scalar(1) + qp*qp); + // This functor is used by hypotNorm only for which it is faster to first apply abs + // on all coefficients prior to reduction through hypot. + // This way we avoid calling abs on positive and real entries, and this also permits + // to seamlessly handle complexes. Otherwise we would have to handle both real and complexes + // through the same functor... + return internal::positive_real_hypot(x,y); } }; template diff --git a/eigenlib/Eigen/src/Core/functors/NullaryFunctors.h b/eigenlib/Eigen/src/Core/functors/NullaryFunctors.h index 0311d903..b03be026 100644 --- a/eigenlib/Eigen/src/Core/functors/NullaryFunctors.h +++ b/eigenlib/Eigen/src/Core/functors/NullaryFunctors.h @@ -44,16 +44,16 @@ struct linspaced_op_impl { linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : m_low(low), m_high(high), m_size1(num_steps==1 ? 1 : num_steps-1), m_step(num_steps==1 ? Scalar() : (high-low)/Scalar(num_steps-1)), - m_interPacket(plset(0)), m_flip(numext::abs(high) EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (IndexType i) const { + typedef typename NumTraits::Real RealScalar; if(m_flip) - return (i==0)? m_low : (m_high - (m_size1-i)*m_step); + return (i==0)? m_low : (m_high - RealScalar(m_size1-i)*m_step); else - return (i==m_size1)? m_high : (m_low + i*m_step); + return (i==m_size1)? m_high : (m_low + RealScalar(i)*m_step); } template @@ -63,7 +63,7 @@ struct linspaced_op_impl // [low, ..., low] + ( [step, ..., step] * ( [i, ..., i] + [0, ..., size] ) ) if(m_flip) { - Packet pi = padd(pset1(Scalar(i-m_size1)),m_interPacket); + Packet pi = plset(Scalar(i-m_size1)); Packet res = padd(pset1(m_high), pmul(pset1(m_step), pi)); if(i==0) res = pinsertfirst(res, m_low); @@ -71,7 +71,7 @@ struct linspaced_op_impl } else { - Packet pi = padd(pset1(Scalar(i)),m_interPacket); + Packet pi = plset(Scalar(i)); Packet res = padd(pset1(m_low), pmul(pset1(m_step), pi)); if(i==m_size1-unpacket_traits::size+1) res = pinsertlast(res, m_high); @@ -83,7 +83,6 @@ struct linspaced_op_impl const Scalar m_high; const Index m_size1; const Scalar m_step; - const Packet m_interPacket; const bool m_flip; }; @@ -93,8 +92,8 @@ struct linspaced_op_impl linspaced_op_impl(const Scalar& low, const Scalar& high, Index num_steps) : m_low(low), m_multiplier((high-low)/convert_index(num_steps<=1 ? 
1 : num_steps-1)), - m_divisor(convert_index(num_steps+high-low)/(high-low+1)), - m_use_divisor((high+1)<(low+num_steps)) + m_divisor(convert_index((high>=low?num_steps:-num_steps)+(high-low))/((numext::abs(high-low)+1)==0?1:(numext::abs(high-low)+1))), + m_use_divisor(num_steps>1 && (numext::abs(high-low)+1) diff --git a/eigenlib/Eigen/src/Core/functors/StlFunctors.h b/eigenlib/Eigen/src/Core/functors/StlFunctors.h index 6df3fa50..9c1d7585 100644 --- a/eigenlib/Eigen/src/Core/functors/StlFunctors.h +++ b/eigenlib/Eigen/src/Core/functors/StlFunctors.h @@ -83,13 +83,17 @@ struct functor_traits > { enum { Cost = functor_traits::Cost, PacketAccess = false }; }; #endif +#if (__cplusplus < 201703L) && (EIGEN_COMP_MSVC < 1910) +// std::unary_negate is deprecated since c++17 and will be removed in c++20 template struct functor_traits > { enum { Cost = 1 + functor_traits::Cost, PacketAccess = false }; }; +// std::binary_negate is deprecated since c++17 and will be removed in c++20 template struct functor_traits > { enum { Cost = 1 + functor_traits::Cost, PacketAccess = false }; }; +#endif #ifdef EIGEN_STDEXT_SUPPORT diff --git a/eigenlib/Eigen/src/Core/functors/UnaryFunctors.h b/eigenlib/Eigen/src/Core/functors/UnaryFunctors.h index 2e6a00ff..b56e7afd 100644 --- a/eigenlib/Eigen/src/Core/functors/UnaryFunctors.h +++ b/eigenlib/Eigen/src/Core/functors/UnaryFunctors.h @@ -768,7 +768,7 @@ struct scalar_sign_op { if (aa==real_type(0)) return Scalar(0); aa = real_type(1)/aa; - return Scalar(real(a)*aa, imag(a)*aa ); + return Scalar(a.real()*aa, a.imag()*aa ); } //TODO //template diff --git a/eigenlib/Eigen/src/Core/products/GeneralBlockPanelKernel.h b/eigenlib/Eigen/src/Core/products/GeneralBlockPanelKernel.h index 45230bce..681451cc 100644 --- a/eigenlib/Eigen/src/Core/products/GeneralBlockPanelKernel.h +++ b/eigenlib/Eigen/src/Core/products/GeneralBlockPanelKernel.h @@ -115,7 +115,8 @@ void evaluateProductBlockingSizesHeuristic(Index& k, Index& m, Index& n, Index n // registers. However once the latency is hidden there is no point in // increasing the value of k, so we'll cap it at 320 (value determined // experimentally). 
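The replacement that follows clamps k_cache from below by kr so that, after k is rounded down to a multiple of the register-blocking factor, it cannot reach zero on machines reporting a tiny L1. A standalone sketch of the patched rule (compute_k and the sizes are illustrative):

#include <algorithm>
#include <cassert>

// Patched blocking rule: cap k by what fits in L1 (at most 320), keep it
// at least kr, then round down to a multiple of kr so the kernel's
// register blocking always divides the cache block.
long compute_k(long k, long l1, long ksub, long kdiv, long kr) {
  long k_cache = std::max(kr, std::min((l1 - ksub) / kdiv, 320L));
  if (k_cache < k) {
    k = k_cache - (k_cache % kr);
    assert(k > 0);                 // guaranteed by the max() above
  }
  return k;
}

int main() {
  // Hypothetical machine: 32 KB L1, kr = 4; a huge k gets clamped to 320.
  assert(compute_k(1000, 32 * 1024, 0, 64, 4) == 320);
  // A degenerate L1 still leaves a usable k equal to kr.
  assert(compute_k(1000, 0, 0, 64, 4) == 4);
}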
- const Index k_cache = (numext::mini)((l1-ksub)/kdiv, 320); + // To avoid that k vanishes, we make k_cache at least as big as kr + const Index k_cache = numext::maxi(kr, (numext::mini)((l1-ksub)/kdiv, 320)); if (k_cache < k) { k = k_cache - (k_cache % kr); eigen_internal_assert(k > 0); @@ -648,8 +649,8 @@ public: // Vectorized path EIGEN_STRONG_INLINE void loadRhs(const RhsScalar* b, DoublePacketType& dest) const { - dest.first = pset1(real(*b)); - dest.second = pset1(imag(*b)); + dest.first = pset1(numext::real(*b)); + dest.second = pset1(numext::imag(*b)); } EIGEN_STRONG_INLINE void loadRhsQuad(const RhsScalar* b, ResPacket& dest) const @@ -1197,10 +1198,16 @@ void gebp_kernel=6 without FMA (bug 1637) + #if EIGEN_GNUC_AT_LEAST(6,0) && defined(EIGEN_VECTORIZE_SSE) + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND __asm__ ("" : [a0] "+x,m" (A0),[a1] "+x,m" (A1)); + #else + #define EIGEN_GEBP_2PX4_SPILLING_WORKAROUND + #endif + #define EIGEN_GEBGP_ONESTEP(K) \ do { \ EIGEN_ASM_COMMENT("begin step of gebp micro kernel 2pX4"); \ - EIGEN_ASM_COMMENT("Note: these asm comments work around bug 935!"); \ traits.loadLhs(&blA[(0+2*K)*LhsProgress], A0); \ traits.loadLhs(&blA[(1+2*K)*LhsProgress], A1); \ traits.broadcastRhs(&blB[(0+4*K)*RhsProgress], B_0, B1, B2, B3); \ @@ -1212,6 +1219,7 @@ void gebp_kernel::half SResPacketHalf; + const int SResPacketHalfSize = unpacket_traits::half>::size; if ((SwappedTraits::LhsProgress % 4) == 0 && (SwappedTraits::LhsProgress <= 8) && - (SwappedTraits::LhsProgress!=8 || unpacket_traits::size==nr)) + (SwappedTraits::LhsProgress!=8 || SResPacketHalfSize==nr)) { SAccPacket C0, C1, C2, C3; straits.initAcc(C0); diff --git a/eigenlib/Eigen/src/Core/products/GeneralMatrixMatrix.h b/eigenlib/Eigen/src/Core/products/GeneralMatrixMatrix.h index 61df3be5..ed6234c3 100644 --- a/eigenlib/Eigen/src/Core/products/GeneralMatrixMatrix.h +++ b/eigenlib/Eigen/src/Core/products/GeneralMatrixMatrix.h @@ -20,8 +20,9 @@ template class level3_blocking; template< typename Index, typename LhsScalar, int LhsStorageOrder, bool ConjugateLhs, - typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs> -struct general_matrix_matrix_product + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct general_matrix_matrix_product { typedef gebp_traits Traits; @@ -30,7 +31,7 @@ struct general_matrix_matrix_product& blocking, GemmParallelInfo* info = 0) @@ -39,8 +40,8 @@ struct general_matrix_matrix_product - ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking,info); + ColMajor,ResInnerStride> + ::run(cols,rows,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking,info); } }; @@ -49,8 +50,9 @@ struct general_matrix_matrix_product -struct general_matrix_matrix_product + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct general_matrix_matrix_product { typedef gebp_traits Traits; @@ -59,17 +61,17 @@ typedef typename ScalarBinaryOpTraits::ReturnType ResScala static void run(Index rows, Index cols, Index depth, const LhsScalar* _lhs, Index lhsStride, const RhsScalar* _rhs, Index rhsStride, - ResScalar* _res, Index resStride, + ResScalar* _res, Index resIncr, Index resStride, ResScalar alpha, level3_blocking& blocking, GemmParallelInfo* info = 0) { typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; - LhsMapper lhs(_lhs,lhsStride); - RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + typedef 
blas_data_mapper ResMapper; + LhsMapper lhs(_lhs, lhsStride); + RhsMapper rhs(_rhs, rhsStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -83,8 +85,8 @@ static void run(Index rows, Index cols, Index depth, if(info) { // this is the parallel version! - Index tid = omp_get_thread_num(); - Index threads = omp_get_num_threads(); + int tid = omp_get_thread_num(); + int threads = omp_get_num_threads(); LhsScalar* blockA = blocking.blockA(); eigen_internal_assert(blockA!=0); @@ -116,9 +118,9 @@ static void run(Index rows, Index cols, Index depth, info[tid].sync = k; // Computes C_i += A' * B' per A'_i - for(Index shift=0; shift static void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::evalTo(dst, lhs, rhs); + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::assign_op()); else { dst.setZero(); @@ -440,7 +442,7 @@ struct generic_product_impl static void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::addTo(dst, lhs, rhs); + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::add_assign_op()); else scaleAndAddTo(dst,lhs, rhs, Scalar(1)); } @@ -449,7 +451,7 @@ struct generic_product_impl static void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs) { if((rhs.rows()+dst.rows()+dst.cols())<20 && rhs.rows()>0) - lazyproduct::subTo(dst, lhs, rhs); + lazyproduct::eval_dynamic(dst, lhs, rhs, internal::sub_assign_op()); else scaleAndAddTo(dst, lhs, rhs, Scalar(-1)); } @@ -476,7 +478,8 @@ struct generic_product_impl Index, LhsScalar, (ActualLhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(LhsBlasTraits::NeedToConjugate), RhsScalar, (ActualRhsTypeCleaned::Flags&RowMajorBit) ? RowMajor : ColMajor, bool(RhsBlasTraits::NeedToConjugate), - (Dest::Flags&RowMajorBit) ? RowMajor : ColMajor>, + (Dest::Flags&RowMajorBit) ? 
RowMajor : ColMajor, + Dest::InnerStrideAtCompileTime>, ActualLhsTypeCleaned, ActualRhsTypeCleaned, Dest, BlockingType> GemmFunctor; BlockingType blocking(dst.rows(), dst.cols(), lhs.cols(), 1, true); diff --git a/eigenlib/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h b/eigenlib/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h index 5cd2794a..d68d2f96 100644 --- a/eigenlib/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h +++ b/eigenlib/Eigen/src/Core/products/GeneralMatrixMatrixTriangular.h @@ -25,51 +25,54 @@ namespace internal { **********************************************************************/ // forward declarations (defined at the end of this file) -template +template struct tribb_kernel; /* Optimized matrix-matrix product evaluating only one triangular half */ template + int ResStorageOrder, int ResInnerStride, int UpLo, int Version = Specialized> struct general_matrix_matrix_triangular_product; // as usual if the result is row major => we transpose the product template -struct general_matrix_matrix_triangular_product + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int UpLo, int Version> +struct general_matrix_matrix_triangular_product { typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* lhs, Index lhsStride, - const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resStride, + const RhsScalar* rhs, Index rhsStride, ResScalar* res, Index resIncr, Index resStride, const ResScalar& alpha, level3_blocking& blocking) { general_matrix_matrix_triangular_product - ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resStride,alpha,blocking); + ColMajor, ResInnerStride, UpLo==Lower?Upper:Lower> + ::run(size,depth,rhs,rhsStride,lhs,lhsStride,res,resIncr,resStride,alpha,blocking); } }; template -struct general_matrix_matrix_triangular_product + typename RhsScalar, int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int UpLo, int Version> +struct general_matrix_matrix_triangular_product { typedef typename ScalarBinaryOpTraits::ReturnType ResScalar; static EIGEN_STRONG_INLINE void run(Index size, Index depth,const LhsScalar* _lhs, Index lhsStride, - const RhsScalar* _rhs, Index rhsStride, ResScalar* _res, Index resStride, + const RhsScalar* _rhs, Index rhsStride, + ResScalar* _res, Index resIncr, Index resStride, const ResScalar& alpha, level3_blocking& blocking) { typedef gebp_traits Traits; typedef const_blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); Index mc = (std::min)(size,blocking.mc()); @@ -87,7 +90,7 @@ struct general_matrix_matrix_triangular_product pack_lhs; gemm_pack_rhs pack_rhs; gebp_kernel gebp; - tribb_kernel sybb; + tribb_kernel sybb; for(Index k2=0; k2 +template struct tribb_kernel { typedef gebp_traits Traits; @@ -142,13 +144,15 @@ struct tribb_kernel enum { BlockSize = meta_least_common_multiple::ret }; - void operator()(ResScalar* _res, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) + void operator()(ResScalar* _res, Index resIncr, Index resStride, const LhsScalar* blockA, const RhsScalar* blockB, Index size, Index depth, const ResScalar& alpha) { - typedef blas_data_mapper ResMapper; - 
ResMapper res(_res, resStride); - gebp_kernel gebp_kernel; + typedef blas_data_mapper ResMapper; + typedef blas_data_mapper BufferMapper; + ResMapper res(_res, resStride, resIncr); + gebp_kernel gebp_kernel1; + gebp_kernel gebp_kernel2; - Matrix buffer; + Matrix buffer((internal::constructor_without_unaligned_array_assert())); // let's process the block per panel of actual_mc x BlockSize, // again, each is split into three parts, etc. @@ -158,31 +162,32 @@ struct tribb_kernel const RhsScalar* actual_b = blockB+j*depth; if(UpLo==Upper) - gebp_kernel(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, - -1, -1, 0, 0); - + gebp_kernel1(res.getSubMapper(0, j), blockA, actual_b, j, depth, actualBlockSize, alpha, + -1, -1, 0, 0); + // selfadjoint micro block { Index i = j; buffer.setZero(); // 1 - apply the kernel on the temporary buffer - gebp_kernel(ResMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, - -1, -1, 0, 0); + gebp_kernel2(BufferMapper(buffer.data(), BlockSize), blockA+depth*i, actual_b, actualBlockSize, depth, actualBlockSize, alpha, + -1, -1, 0, 0); + // 2 - triangular accumulation for(Index j1=0; j1 enum { IsRowMajor = (internal::traits::Flags&RowMajorBit) ? 1 : 0, LhsIsRowMajor = _ActualLhs::Flags&RowMajorBit ? 1 : 0, - RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0 + RhsIsRowMajor = _ActualRhs::Flags&RowMajorBit ? 1 : 0, + SkipDiag = (UpLo&(UnitDiag|ZeroDiag))!=0 }; Index size = mat.cols(); + if(SkipDiag) + size--; Index depth = actualLhs.cols(); typedef internal::gemm_blocking_space internal::general_matrix_matrix_triangular_product + IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo&(Lower|Upper)> ::run(size, depth, - &actualLhs.coeffRef(0,0), actualLhs.outerStride(), &actualRhs.coeffRef(0,0), actualRhs.outerStride(), - mat.data(), mat.outerStride(), actualAlpha, blocking); + &actualLhs.coeffRef(SkipDiag&&(UpLo&Lower)==Lower ? 1 : 0,0), actualLhs.outerStride(), + &actualRhs.coeffRef(0,SkipDiag&&(UpLo&Upper)==Upper ? 1 : 0), actualRhs.outerStride(), + mat.data() + (SkipDiag ? (bool(IsRowMajor) != ((UpLo&Lower)==Lower) ? 
mat.innerStride() : mat.outerStride() ) : 0), + mat.innerStride(), mat.outerStride(), actualAlpha, blocking); } }; @@ -294,6 +304,7 @@ template template TriangularView& TriangularViewImpl::_assignProduct(const ProductType& prod, const Scalar& alpha, bool beta) { + EIGEN_STATIC_ASSERT((UpLo&UnitDiag)==0, WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED); eigen_assert(derived().nestedExpression().rows() == prod.rows() && derived().cols() == prod.cols()); general_product_to_triangular_selector::InnerSize==1>::run(derived().nestedExpression().const_cast_derived(), prod, alpha, beta); diff --git a/eigenlib/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h b/eigenlib/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h index 5b7c15cc..691f95d6 100644 --- a/eigenlib/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h +++ b/eigenlib/Eigen/src/Core/products/GeneralMatrixMatrixTriangular_BLAS.h @@ -40,7 +40,7 @@ namespace internal { template struct general_matrix_matrix_rankupdate : general_matrix_matrix_triangular_product< - Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,UpLo,BuiltIn> {}; + Index,Scalar,AStorageOrder,ConjugateA,Scalar,AStorageOrder,ConjugateA,ResStorageOrder,1,UpLo,BuiltIn> {}; // try to go to BLAS specialization @@ -48,19 +48,19 @@ struct general_matrix_matrix_rankupdate : template \ struct general_matrix_matrix_triangular_product { \ + Scalar,RhsStorageOrder,ConjugateRhs,ColMajor,1,UpLo,Specialized> { \ static EIGEN_STRONG_INLINE void run(Index size, Index depth,const Scalar* lhs, Index lhsStride, \ - const Scalar* rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking& blocking) \ + const Scalar* rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking& blocking) \ { \ - if (lhs==rhs) { \ + if ( lhs==rhs && ((UpLo&(Lower|Upper))==UpLo) ) { \ general_matrix_matrix_rankupdate \ ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ } else { \ general_matrix_matrix_triangular_product \ - ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resStride,alpha,blocking); \ + ColMajor, 1, UpLo, BuiltIn> \ + ::run(size,depth,lhs,lhsStride,rhs,rhsStride,res,resIncr,resStride,alpha,blocking); \ } \ } \ }; @@ -88,7 +88,7 @@ struct general_matrix_matrix_rankupdate(lhsStride), ldc=convert_index(resStride), n=convert_index(size), k=convert_index(depth); \ char uplo=((IsLower) ? 'L' : 'U'), trans=((AStorageOrder==RowMajor) ? 
'T':'N'); \ EIGTYPE beta(1); \ - BLASFUNC(&uplo, &trans, &n, &k, &numext::real_ref(alpha), lhs, &lda, &numext::real_ref(beta), res, &ldc); \ + BLASFUNC(&uplo, &trans, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), lhs, &lda, (const BLASTYPE*)&numext::real_ref(beta), res, &ldc); \ } \ }; @@ -125,9 +125,13 @@ struct general_matrix_matrix_rankupdate \ -struct general_matrix_matrix_product \ +struct general_matrix_matrix_product \ { \ typedef gebp_traits Traits; \ \ static void run(Index rows, Index cols, Index depth, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, \ level3_blocking& /*blocking*/, \ GemmParallelInfo* /*info = 0*/) \ { \ using std::conj; \ \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char transa, transb; \ BlasIndex m, n, k, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -100,13 +102,20 @@ static void run(Index rows, Index cols, Index depth, \ ldb = convert_index(b_tmp.outerStride()); \ } else b = _rhs; \ \ - BLASPREFIX##gemm_(&transa, &transb, &m, &n, &k, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&transa, &transb, &m, &n, &k, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ }}; -GEMM_SPECIALIZATION(double, d, double, d) -GEMM_SPECIALIZATION(float, f, float, s) -GEMM_SPECIALIZATION(dcomplex, cd, double, z) -GEMM_SPECIALIZATION(scomplex, cf, float, c) +#ifdef EIGEN_USE_MKL +GEMM_SPECIALIZATION(double, d, double, dgemm) +GEMM_SPECIALIZATION(float, f, float, sgemm) +GEMM_SPECIALIZATION(dcomplex, cd, MKL_Complex16, zgemm) +GEMM_SPECIALIZATION(scomplex, cf, MKL_Complex8, cgemm) +#else +GEMM_SPECIALIZATION(double, d, double, dgemm_) +GEMM_SPECIALIZATION(float, f, float, sgemm_) +GEMM_SPECIALIZATION(dcomplex, cd, double, zgemm_) +GEMM_SPECIALIZATION(scomplex, cf, float, cgemm_) +#endif } // end namespase internal diff --git a/eigenlib/Eigen/src/Core/products/GeneralMatrixVector.h b/eigenlib/Eigen/src/Core/products/GeneralMatrixVector.h index 3c1a7fc4..a597c1f4 100644 --- a/eigenlib/Eigen/src/Core/products/GeneralMatrixVector.h +++ b/eigenlib/Eigen/src/Core/products/GeneralMatrixVector.h @@ -183,8 +183,8 @@ EIGEN_DONT_INLINE void general_matrix_vector_product \ struct general_matrix_vector_product_gemv \ { \ @@ -113,14 +113,21 @@ static void run( \ x_ptr=x_tmp.data(); \ incx=1; \ } else x_ptr=rhs; \ - BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, &numext::real_ref(beta), (BLASTYPE*)res, &incy); \ + BLASFUNC(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)lhs, &lda, (const BLASTYPE*)x_ptr, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &incy); \ }\ }; -EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, d) -EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, s) -EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, z) -EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, MKL_Complex16, zgemv) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, MKL_Complex8 , cgemv) +#else +EIGEN_BLAS_GEMV_SPECIALIZATION(double, double, dgemv_) 
+EIGEN_BLAS_GEMV_SPECIALIZATION(float, float, sgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(dcomplex, double, zgemv_) +EIGEN_BLAS_GEMV_SPECIALIZATION(scomplex, float, cgemv_) +#endif } // end namespase internal diff --git a/eigenlib/Eigen/src/Core/products/Parallelizer.h b/eigenlib/Eigen/src/Core/products/Parallelizer.h index 2a31e4cb..a3cc05b7 100644 --- a/eigenlib/Eigen/src/Core/products/Parallelizer.h +++ b/eigenlib/Eigen/src/Core/products/Parallelizer.h @@ -17,7 +17,8 @@ namespace internal { /** \internal */ inline void manage_multi_threading(Action action, int* v) { - static EIGEN_UNUSED int m_maxThreads = -1; + static int m_maxThreads = -1; + EIGEN_UNUSED_VARIABLE(m_maxThreads); if(action==SetAction) { @@ -75,7 +76,7 @@ template struct GemmParallelInfo { GemmParallelInfo() : sync(-1), users(0), lhs_start(0), lhs_length(0) {} - int volatile sync; + Index volatile sync; int volatile users; Index lhs_start; @@ -104,13 +105,14 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, // - the sizes are large enough // compute the maximal number of threads from the size of the product: - // FIXME this has to be fine tuned + // This first heuristic takes into account that the product kernel is fully optimized when working with nr columns at once. Index size = transpose ? rows : cols; - Index pb_max_threads = std::max(1,size / 32); + Index pb_max_threads = std::max(1,size / Functor::Traits::nr); + // compute the maximal number of threads from the total amount of work: double work = static_cast(rows) * static_cast(cols) * static_cast(depth); - double kMinTaskSize = 50000; // Heuristic. + double kMinTaskSize = 50000; // FIXME improve this heuristic. pb_max_threads = std::max(1, std::min(pb_max_threads, work / kMinTaskSize)); // compute the number of threads we are going to use @@ -149,8 +151,10 @@ void parallelize_gemm(const Functor& func, Index rows, Index cols, Index depth, info[i].lhs_start = r0; info[i].lhs_length = actualBlockRows; - if(transpose) func(c0, actualBlockCols, 0, rows, info); - else func(0, rows, c0, actualBlockCols, info); + if(transpose) + func(c0, actualBlockCols, 0, rows, info); + else + func(0, rows, c0, actualBlockCols, info); } #endif } diff --git a/eigenlib/Eigen/src/Core/products/SelfadjointMatrixMatrix.h b/eigenlib/Eigen/src/Core/products/SelfadjointMatrixMatrix.h index da6f82ab..04c93348 100644 --- a/eigenlib/Eigen/src/Core/products/SelfadjointMatrixMatrix.h +++ b/eigenlib/Eigen/src/Core/products/SelfadjointMatrixMatrix.h @@ -277,20 +277,21 @@ struct symm_pack_rhs template + int ResStorageOrder, int ResInnerStride> struct product_selfadjoint_matrix; template -struct product_selfadjoint_matrix + int RhsStorageOrder, bool RhsSelfAdjoint, bool ConjugateRhs, + int ResInnerStride> +struct product_selfadjoint_matrix { static EIGEN_STRONG_INLINE void run( Index rows, Index cols, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { product_selfadjoint_matrix::IsComplex && EIGEN_LOGICAL_XOR(RhsSelfAdjoint,ConjugateRhs), EIGEN_LOGICAL_XOR(LhsSelfAdjoint,LhsStorageOrder==RowMajor) ? 
ColMajor : RowMajor, LhsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsSelfAdjoint,ConjugateLhs), - ColMajor> - ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); + ColMajor,ResInnerStride> + ::run(cols, rows, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking); } }; template -struct product_selfadjoint_matrix + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct product_selfadjoint_matrix { static EIGEN_DONT_INLINE void run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking); }; template -EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = rows; @@ -334,11 +337,11 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix LhsMapper; typedef const_blas_data_mapper LhsTransposeMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); LhsTransposeMapper lhs_transpose(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -398,26 +401,28 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix -struct product_selfadjoint_matrix + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +struct product_selfadjoint_matrix { static EIGEN_DONT_INLINE void run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking); }; template -EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride> +EIGEN_DONT_INLINE void product_selfadjoint_matrix::run( Index rows, Index cols, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { Index size = cols; @@ -425,9 +430,9 @@ EIGEN_DONT_INLINE void product_selfadjoint_matrix Traits; typedef const_blas_data_mapper LhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); - ResMapper res(_res,resStride); + ResMapper res(_res,resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -503,12 +508,13 @@ struct selfadjoint_product_impl NumTraits::IsComplex && EIGEN_LOGICAL_XOR(LhsIsUpper,bool(LhsBlasTraits::NeedToConjugate)), EIGEN_LOGICAL_XOR(RhsIsUpper,internal::traits::Flags &RowMajorBit) ? RowMajor : ColMajor, RhsIsSelfAdjoint, NumTraits::IsComplex && EIGEN_LOGICAL_XOR(RhsIsUpper,bool(RhsBlasTraits::NeedToConjugate)), - internal::traits::Flags&RowMajorBit ? 
RowMajor : ColMajor> + internal::traits::Flags&RowMajorBit ? RowMajor : ColMajor, + Dest::InnerStrideAtCompileTime> ::run( lhs.rows(), rhs.cols(), // sizes &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info - &dst.coeffRef(0,0), dst.outerStride(), // result info + &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info actualAlpha, blocking // alpha ); } diff --git a/eigenlib/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h b/eigenlib/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h index a45238d6..61396dbd 100644 --- a/eigenlib/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h +++ b/eigenlib/Eigen/src/Core/products/SelfadjointMatrixMatrix_BLAS.h @@ -40,20 +40,22 @@ namespace internal { /* Optimized selfadjoint matrix * matrix (?SYMM/?HEMM) product */ -#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_SYMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ -struct product_selfadjoint_matrix \ +struct product_selfadjoint_matrix \ {\ \ static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char side='L', uplo='L'; \ BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -81,25 +83,27 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } else b = _rhs; \ \ - BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_HEMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ -struct product_selfadjoint_matrix \ +struct product_selfadjoint_matrix \ {\ static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char side='L', uplo='L'; \ BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -144,33 +148,41 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } \ \ - BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -EIGEN_BLAS_SYMM_L(double, double, d, d) -EIGEN_BLAS_SYMM_L(float, float, f, s) -EIGEN_BLAS_HEMM_L(dcomplex, double, cd, z) -EIGEN_BLAS_HEMM_L(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_L(double, double, d, dsymm) +EIGEN_BLAS_SYMM_L(float, float, f, ssymm) +EIGEN_BLAS_HEMM_L(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_L(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_L(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_L(float, float, f, 
ssymm_) +EIGEN_BLAS_HEMM_L(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_L(scomplex, float, cf, chemm_) +#endif /* Optimized matrix * selfadjoint matrix (?SYMM/?HEMM) product */ -#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_SYMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ -struct product_selfadjoint_matrix \ +struct product_selfadjoint_matrix \ {\ \ static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char side='R', uplo='L'; \ BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -197,25 +209,27 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } else b = _lhs; \ \ - BLASPREFIX##symm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ \ } \ }; -#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_HEMM_R(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ -struct product_selfadjoint_matrix \ +struct product_selfadjoint_matrix \ {\ static void run( \ Index rows, Index cols, \ const EIGTYPE* _lhs, Index lhsStride, \ const EIGTYPE* _rhs, Index rhsStride, \ - EIGTYPE* res, Index resStride, \ + EIGTYPE* res, Index resIncr, Index resStride, \ EIGTYPE alpha, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ char side='R', uplo='L'; \ BlasIndex m, n, lda, ldb, ldc; \ const EIGTYPE *a, *b; \ @@ -259,15 +273,21 @@ struct product_selfadjoint_matrix(b_tmp.outerStride()); \ } \ \ - BLASPREFIX##hemm_(&side, &uplo, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, &numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ + BLASFUNC(&side, &uplo, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)b, &ldb, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)res, &ldc); \ } \ }; -EIGEN_BLAS_SYMM_R(double, double, d, d) -EIGEN_BLAS_SYMM_R(float, float, f, s) -EIGEN_BLAS_HEMM_R(dcomplex, double, cd, z) -EIGEN_BLAS_HEMM_R(scomplex, float, cf, c) - +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_SYMM_R(double, double, d, dsymm) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm) +EIGEN_BLAS_HEMM_R(dcomplex, MKL_Complex16, cd, zhemm) +EIGEN_BLAS_HEMM_R(scomplex, MKL_Complex8, cf, chemm) +#else +EIGEN_BLAS_SYMM_R(double, double, d, dsymm_) +EIGEN_BLAS_SYMM_R(float, float, f, ssymm_) +EIGEN_BLAS_HEMM_R(dcomplex, double, cd, zhemm_) +EIGEN_BLAS_HEMM_R(scomplex, float, cf, chemm_) +#endif } // end namespace internal } // end namespace Eigen diff --git a/eigenlib/Eigen/src/Core/products/SelfadjointMatrixVector.h b/eigenlib/Eigen/src/Core/products/SelfadjointMatrixVector.h index d97f8caa..3fd180e6 100644 --- a/eigenlib/Eigen/src/Core/products/SelfadjointMatrixVector.h +++ b/eigenlib/Eigen/src/Core/products/SelfadjointMatrixVector.h @@ -83,10 +83,10 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product(t3); - size_t starti = FirstTriangular ? 0 : j+2; - size_t endi = FirstTriangular ? 
j : size; - size_t alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti); - size_t alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize); + Index starti = FirstTriangular ? 0 : j+2; + Index endi = FirstTriangular ? j : size; + Index alignedStart = (starti) + internal::first_default_aligned(&res[starti], endi-starti); + Index alignedEnd = alignedStart + ((endi-alignedStart)/(PacketSize))*(PacketSize); res[j] += cjd.pmul(numext::real(A0[j]), t0); res[j+1] += cjd.pmul(numext::real(A1[j+1]), t1); @@ -101,7 +101,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product(a0It); a0It += PacketSize; Packet A1i = ploadu(a1It); a1It += PacketSize; @@ -125,7 +125,7 @@ EIGEN_DONT_INLINE void selfadjoint_matrix_vector_product internal::general_matrix_matrix_triangular_product::IsComplex, Scalar, OtherIsRowMajor ? ColMajor : RowMajor, (!OtherBlasTraits::NeedToConjugate) && NumTraits::IsComplex, - IsRowMajor ? RowMajor : ColMajor, UpLo> + IsRowMajor ? RowMajor : ColMajor, MatrixType::InnerStrideAtCompileTime, UpLo> ::run(size, depth, &actualOther.coeffRef(0,0), actualOther.outerStride(), &actualOther.coeffRef(0,0), actualOther.outerStride(), - mat.data(), mat.outerStride(), actualAlpha, blocking); + mat.data(), mat.innerStride(), mat.outerStride(), actualAlpha, blocking); } }; diff --git a/eigenlib/Eigen/src/Core/products/TriangularMatrixMatrix.h b/eigenlib/Eigen/src/Core/products/TriangularMatrixMatrix.h index 8a2f7cd7..2fb408d1 100644 --- a/eigenlib/Eigen/src/Core/products/TriangularMatrixMatrix.h +++ b/eigenlib/Eigen/src/Core/products/TriangularMatrixMatrix.h @@ -45,22 +45,24 @@ template + int ResStorageOrder, int ResInnerStride, + int Version = Specialized> struct product_triangular_matrix_matrix; template + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> struct product_triangular_matrix_matrix + RhsStorageOrder,ConjugateRhs,RowMajor,ResInnerStride,Version> { static EIGEN_STRONG_INLINE void run( Index rows, Index cols, Index depth, const Scalar* lhs, Index lhsStride, const Scalar* rhs, Index rhsStride, - Scalar* res, Index resStride, + Scalar* res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { product_triangular_matrix_matrix - ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resStride, alpha, blocking); + ColMajor, ResInnerStride> + ::run(cols, rows, depth, rhs, rhsStride, lhs, lhsStride, res, resIncr, resStride, alpha, blocking); } }; // implements col-major += alpha * op(triangular) * op(general) template + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> struct product_triangular_matrix_matrix + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version> { typedef gebp_traits Traits; @@ -95,20 +98,21 @@ struct product_triangular_matrix_matrix& blocking); }; template + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { // strip zeros @@ -119,10 +123,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper 
lhs(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -137,7 +141,13 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer; + // To work around an "error: member reference base type 'Matrix<...> + // (Eigen::internal::constructor_without_unaligned_array_assert (*)())' is + // not a structure or union" compilation error in nvcc (tested V8.0.61), + // create a dummy internal::constructor_without_unaligned_array_assert + // object to pass to the Matrix constructor. + internal::constructor_without_unaligned_array_assert a; + Matrix triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); @@ -229,10 +239,11 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> struct product_triangular_matrix_matrix + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version> { typedef gebp_traits Traits; enum { @@ -245,20 +256,21 @@ struct product_triangular_matrix_matrix& blocking); }; template + int RhsStorageOrder, bool ConjugateRhs, + int ResInnerStride, int Version> EIGEN_DONT_INLINE void product_triangular_matrix_matrix::run( + RhsStorageOrder,ConjugateRhs,ColMajor,ResInnerStride,Version>::run( Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride, const Scalar* _rhs, Index rhsStride, - Scalar* _res, Index resStride, + Scalar* _res, Index resIncr, Index resStride, const Scalar& alpha, level3_blocking& blocking) { const Index PacketBytes = packet_traits::size*sizeof(Scalar); @@ -270,10 +282,10 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix LhsMapper; typedef const_blas_data_mapper RhsMapper; - typedef blas_data_mapper ResMapper; + typedef blas_data_mapper ResMapper; LhsMapper lhs(_lhs,lhsStride); RhsMapper rhs(_rhs,rhsStride); - ResMapper res(_res, resStride); + ResMapper res(_res, resStride, resIncr); Index kc = blocking.kc(); // cache block size along the K direction Index mc = (std::min)(rows,blocking.mc()); // cache block size along the M direction @@ -284,7 +296,8 @@ EIGEN_DONT_INLINE void product_triangular_matrix_matrix triangularBuffer; + internal::constructor_without_unaligned_array_assert a; + Matrix triangularBuffer(a); triangularBuffer.setZero(); if((Mode&ZeroDiag)==ZeroDiag) triangularBuffer.diagonal().setZero(); @@ -393,7 +406,9 @@ struct triangular_product_impl { template static void run(Dest& dst, const Lhs &a_lhs, const Rhs &a_rhs, const typename Dest::Scalar& alpha) { - typedef typename Dest::Scalar Scalar; + typedef typename Lhs::Scalar LhsScalar; + typedef typename Rhs::Scalar RhsScalar; + typedef typename Dest::Scalar Scalar; typedef internal::blas_traits LhsBlasTraits; typedef typename LhsBlasTraits::DirectLinearAccessType ActualLhsType; @@ -405,8 +420,9 @@ struct triangular_product_impl typename internal::add_const_on_value_type::type lhs = LhsBlasTraits::extract(a_lhs); typename internal::add_const_on_value_type::type rhs = RhsBlasTraits::extract(a_rhs); - Scalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(a_lhs) - * RhsBlasTraits::extractScalarFactor(a_rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(a_lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(a_rhs); + Scalar actualAlpha = alpha * lhs_alpha * rhs_alpha; 
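// --- annotation (editor's reading of the patch, not authoritative) ---------
// Why lhs_alpha and rhs_alpha are now extracted separately instead of being
// folded straight into actualAlpha: blas_traits pulls a scalar factor s out
// of an expression such as (s*A).triangularView<UnitLower>(), so the kernel
// sees the unscaled A but, because Mode contains UnitDiag, substitutes a
// diagonal of exactly 1. Writing the unit-diagonal operand as U + I with U
// strictly triangular, and with s folded into actualAlpha:
//
//   kernel computes     s*(U + I)*B = s*U*B + s*B
//   the expression is   (s*U + I)*B = s*U*B +   B
//
// so the kernel overshoots by (s - 1)*B on the rows covered by the diagonal.
// That is exactly what the correction block inserted after run() below
// subtracts, via lhs_alpha - LhsScalar(1) (and symmetrically via rhs_alpha
// when the triangular factor sits on the right).
// ---------------------------------------------------------------------------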
typedef internal::gemm_blocking_space<(Dest::Flags&RowMajorBit) ? RowMajor : ColMajor,Scalar,Scalar, Lhs::MaxRowsAtCompileTime, Rhs::MaxColsAtCompileTime, Lhs::MaxColsAtCompileTime,4> BlockingType; @@ -423,14 +439,29 @@ struct triangular_product_impl Mode, LhsIsTriangular, (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor, LhsBlasTraits::NeedToConjugate, (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor, RhsBlasTraits::NeedToConjugate, - (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor> + (internal::traits::Flags&RowMajorBit) ? RowMajor : ColMajor, Dest::InnerStrideAtCompileTime> ::run( stripedRows, stripedCols, stripedDepth, // sizes &lhs.coeffRef(0,0), lhs.outerStride(), // lhs info &rhs.coeffRef(0,0), rhs.outerStride(), // rhs info - &dst.coeffRef(0,0), dst.outerStride(), // result info + &dst.coeffRef(0,0), dst.innerStride(), dst.outerStride(), // result info actualAlpha, blocking ); + + // Apply correction if the diagonal is unit and a scalar factor was nested: + if ((Mode&UnitDiag)==UnitDiag) + { + if (LhsIsTriangular && lhs_alpha!=LhsScalar(1)) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dst.topRows(diagSize) -= ((lhs_alpha-LhsScalar(1))*a_rhs).topRows(diagSize); + } + else if ((!LhsIsTriangular) && rhs_alpha!=RhsScalar(1)) + { + Index diagSize = (std::min)(rhs.rows(),rhs.cols()); + dst.leftCols(diagSize) -= (rhs_alpha-RhsScalar(1))*a_lhs.leftCols(diagSize); + } + } } }; diff --git a/eigenlib/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h b/eigenlib/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h index aecded6b..a98d12e4 100644 --- a/eigenlib/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h +++ b/eigenlib/Eigen/src/Core/products/TriangularMatrixMatrix_BLAS.h @@ -46,7 +46,7 @@ template {}; + RhsStorageOrder, ConjugateRhs, ResStorageOrder, 1, BuiltIn> {}; // try to go to BLAS specialization @@ -55,13 +55,15 @@ template \ struct product_triangular_matrix_matrix { \ + LhsStorageOrder,ConjugateLhs, RhsStorageOrder,ConjugateRhs,ColMajor,1,Specialized> { \ static inline void run(Index _rows, Index _cols, Index _depth, const Scalar* _lhs, Index lhsStride,\ - const Scalar* _rhs, Index rhsStride, Scalar* res, Index resStride, Scalar alpha, level3_blocking& blocking) { \ + const Scalar* _rhs, Index rhsStride, Scalar* res, Index resIncr, Index resStride, Scalar alpha, level3_blocking& blocking) { \ + EIGEN_ONLY_USED_FOR_DEBUG(resIncr); \ + eigen_assert(resIncr == 1); \ product_triangular_matrix_matrix_trmm::run( \ - _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ } \ }; @@ -75,7 +77,7 @@ EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, true) EIGEN_BLAS_TRMM_SPECIALIZE(scomplex, false) // implements col-major += alpha * op(triangular) * op(general) -#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMM_L(EIGTYPE, BLASTYPE, EIGPREFIX, BLASFUNC) \ template \ @@ -115,8 +117,8 @@ struct product_triangular_matrix_matrix_trmm::run( \ - _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \ /*std::cout << "TRMM_L: A is not square! 
Go to Eigen TRMM implementation!\n";*/ \ } else { \ /* Make sense to call GEMM */ \ @@ -124,8 +126,8 @@ struct product_triangular_matrix_matrix_trmm(); \ BlasIndex aStride = convert_index(aa_tmp.outerStride()); \ gemm_blocking_space gemm_blocking(_rows,_cols,_depth, 1, true); \ - general_matrix_matrix_product::run( \ - rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, resStride, alpha, gemm_blocking, 0); \ + general_matrix_matrix_product::run( \ + rows, cols, depth, aa_tmp.data(), aStride, _rhs, rhsStride, res, 1, resStride, alpha, gemm_blocking, 0); \ \ /*std::cout << "TRMM_L: A is not square! Go to BLAS GEMM implementation! " << nthr<<" \n";*/ \ } \ @@ -172,7 +174,7 @@ struct product_triangular_matrix_matrix_trmm > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -180,13 +182,20 @@ struct product_triangular_matrix_matrix_trmm \ @@ -225,8 +234,8 @@ struct product_triangular_matrix_matrix_trmm::run( \ - _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, resStride, alpha, blocking); \ + LhsStorageOrder,ConjugateLhs, RhsStorageOrder, ConjugateRhs, ColMajor, 1, BuiltIn>::run( \ + _rows, _cols, _depth, _lhs, lhsStride, _rhs, rhsStride, res, 1, resStride, alpha, blocking); \ /*std::cout << "TRMM_R: A is not square! Go to Eigen TRMM implementation!\n";*/ \ } else { \ /* Make sense to call GEMM */ \ @@ -234,8 +243,8 @@ struct product_triangular_matrix_matrix_trmm(); \ BlasIndex aStride = convert_index(aa_tmp.outerStride()); \ gemm_blocking_space gemm_blocking(_rows,_cols,_depth, 1, true); \ - general_matrix_matrix_product::run( \ - rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, resStride, alpha, gemm_blocking, 0); \ + general_matrix_matrix_product::run( \ + rows, cols, depth, _lhs, lhsStride, aa_tmp.data(), aStride, res, 1, resStride, alpha, gemm_blocking, 0); \ \ /*std::cout << "TRMM_R: A is not square! Go to BLAS GEMM implementation! 
" << nthr<<" \n";*/ \ } \ @@ -282,7 +291,7 @@ struct product_triangular_matrix_matrix_trmm > res_tmp(res,rows,cols,OuterStride<>(resStride)); \ @@ -290,11 +299,17 @@ struct product_triangular_matrix_matrix_trmm struct trmv_selector typename internal::add_const_on_value_type::type actualLhs = LhsBlasTraits::extract(lhs); typename internal::add_const_on_value_type::type actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha; enum { // FIXME find a way to allow an inner stride on the result if packet_traits::size==1 @@ -274,6 +275,12 @@ template struct trmv_selector else dest = MappedDest(actualDestPtr, dest.size()); } + + if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); + } } }; @@ -295,8 +302,9 @@ template struct trmv_selector typename add_const::type actualLhs = LhsBlasTraits::extract(lhs); typename add_const::type actualRhs = RhsBlasTraits::extract(rhs); - ResScalar actualAlpha = alpha * LhsBlasTraits::extractScalarFactor(lhs) - * RhsBlasTraits::extractScalarFactor(rhs); + LhsScalar lhs_alpha = LhsBlasTraits::extractScalarFactor(lhs); + RhsScalar rhs_alpha = RhsBlasTraits::extractScalarFactor(rhs); + ResScalar actualAlpha = alpha * lhs_alpha * rhs_alpha; enum { DirectlyUseRhs = ActualRhsTypeCleaned::InnerStrideAtCompileTime==1 @@ -326,6 +334,12 @@ template struct trmv_selector actualRhsPtr,1, dest.data(),dest.innerStride(), actualAlpha); + + if ( ((Mode&UnitDiag)==UnitDiag) && (lhs_alpha!=LhsScalar(1)) ) + { + Index diagSize = (std::min)(lhs.rows(),lhs.cols()); + dest.head(diagSize) -= (lhs_alpha-LhsScalar(1))*rhs.head(diagSize); + } } }; diff --git a/eigenlib/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h b/eigenlib/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h index 07bf26ce..3d47a2b9 100644 --- a/eigenlib/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h +++ b/eigenlib/Eigen/src/Core/products/TriangularMatrixVector_BLAS.h @@ -71,7 +71,7 @@ EIGEN_BLAS_TRMV_SPECIALIZE(dcomplex) EIGEN_BLAS_TRMV_SPECIALIZE(scomplex) // implements col-major: res += alpha * op(triangular) * vector -#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMV_CM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \ template \ struct triangular_matrix_vector_product_trmv { \ enum { \ @@ -121,10 +121,10 @@ struct triangular_matrix_vector_product_trmv(size); \ n = convert_index(cols-size); \ } \ - BLASPREFIX##gemv_(&trans, &m, &n, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + BLASPREFIX##gemv##BLASPOSTFIX(&trans, &m, &n, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_BLAS_TRMV_CM(double, double, d, d) -EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z) -EIGEN_BLAS_TRMV_CM(float, float, f, s) -EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMV_CM(double, double, d, d,) +EIGEN_BLAS_TRMV_CM(dcomplex, MKL_Complex16, cd, z,) +EIGEN_BLAS_TRMV_CM(float, float, f, s,) +EIGEN_BLAS_TRMV_CM(scomplex, 
MKL_Complex8, cf, c,) +#else +EIGEN_BLAS_TRMV_CM(double, double, d, d, _) +EIGEN_BLAS_TRMV_CM(dcomplex, double, cd, z, _) +EIGEN_BLAS_TRMV_CM(float, float, f, s, _) +EIGEN_BLAS_TRMV_CM(scomplex, float, cf, c, _) +#endif // implements row-major: res += alpha * op(triangular) * vector -#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX) \ +#define EIGEN_BLAS_TRMV_RM(EIGTYPE, BLASTYPE, EIGPREFIX, BLASPREFIX, BLASPOSTFIX) \ template \ struct triangular_matrix_vector_product_trmv { \ enum { \ @@ -203,10 +210,10 @@ struct triangular_matrix_vector_product_trmv(size); \ n = convert_index(cols-size); \ } \ - BLASPREFIX##gemv_(&trans, &n, &m, &numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, &numext::real_ref(beta), (BLASTYPE*)y, &incy); \ + BLASPREFIX##gemv##BLASPOSTFIX(&trans, &n, &m, (const BLASTYPE*)&numext::real_ref(alpha), (const BLASTYPE*)a, &lda, (const BLASTYPE*)x, &incx, (const BLASTYPE*)&numext::real_ref(beta), (BLASTYPE*)y, &incy); \ } \ } \ }; -EIGEN_BLAS_TRMV_RM(double, double, d, d) -EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z) -EIGEN_BLAS_TRMV_RM(float, float, f, s) -EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c) +#ifdef EIGEN_USE_MKL +EIGEN_BLAS_TRMV_RM(double, double, d, d,) +EIGEN_BLAS_TRMV_RM(dcomplex, MKL_Complex16, cd, z,) +EIGEN_BLAS_TRMV_RM(float, float, f, s,) +EIGEN_BLAS_TRMV_RM(scomplex, MKL_Complex8, cf, c,) +#else +EIGEN_BLAS_TRMV_RM(double, double, d, d,_) +EIGEN_BLAS_TRMV_RM(dcomplex, double, cd, z,_) +EIGEN_BLAS_TRMV_RM(float, float, f, s,_) +EIGEN_BLAS_TRMV_RM(scomplex, float, cf, c,_) +#endif } // end namespase internal diff --git a/eigenlib/Eigen/src/Core/products/TriangularSolverMatrix.h b/eigenlib/Eigen/src/Core/products/TriangularSolverMatrix.h index 223c38b8..e3ed2cd1 100644 --- a/eigenlib/Eigen/src/Core/products/TriangularSolverMatrix.h +++ b/eigenlib/Eigen/src/Core/products/TriangularSolverMatrix.h @@ -15,48 +15,48 @@ namespace Eigen { namespace internal { // if the rhs is row major, let's transpose the product -template -struct triangular_solve_matrix +template +struct triangular_solve_matrix { static void run( Index size, Index cols, const Scalar* tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking& blocking) { triangular_solve_matrix< Scalar, Index, Side==OnTheLeft?OnTheRight:OnTheLeft, (Mode&UnitDiag) | ((Mode&Upper) ? Lower : Upper), NumTraits::IsComplex && Conjugate, - TriStorageOrder==RowMajor ? ColMajor : RowMajor, ColMajor> - ::run(size, cols, tri, triStride, _other, otherStride, blocking); + TriStorageOrder==RowMajor ? 
ColMajor : RowMajor, ColMajor, OtherInnerStride> + ::run(size, cols, tri, triStride, _other, otherIncr, otherStride, blocking); } }; /* Optimized triangular solver with multiple right hand side and the triangular matrix on the left */ -template -struct triangular_solve_matrix +template +struct triangular_solve_matrix { static EIGEN_DONT_INLINE void run( Index size, Index otherSize, const Scalar* _tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking& blocking); }; -template -EIGEN_DONT_INLINE void triangular_solve_matrix::run( +template +EIGEN_DONT_INLINE void triangular_solve_matrix::run( Index size, Index otherSize, const Scalar* _tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking& blocking) { Index cols = otherSize; typedef const_blas_data_mapper TriMapper; - typedef blas_data_mapper OtherMapper; + typedef blas_data_mapper OtherMapper; TriMapper tri(_tri, triStride); - OtherMapper other(_other, otherStride); + OtherMapper other(_other, otherStride, otherIncr); typedef gebp_traits Traits; @@ -128,19 +128,19 @@ EIGEN_DONT_INLINE void triangular_solve_matrix -struct triangular_solve_matrix +template +struct triangular_solve_matrix { static EIGEN_DONT_INLINE void run( Index size, Index otherSize, const Scalar* _tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking& blocking); }; -template -EIGEN_DONT_INLINE void triangular_solve_matrix::run( +template +EIGEN_DONT_INLINE void triangular_solve_matrix::run( Index size, Index otherSize, const Scalar* _tri, Index triStride, - Scalar* _other, Index otherStride, + Scalar* _other, Index otherIncr, Index otherStride, level3_blocking& blocking) { Index rows = otherSize; typedef typename NumTraits::Real RealScalar; - typedef blas_data_mapper LhsMapper; + typedef blas_data_mapper LhsMapper; typedef const_blas_data_mapper RhsMapper; - LhsMapper lhs(_other, otherStride); + LhsMapper lhs(_other, otherStride, otherIncr); RhsMapper rhs(_tri, triStride); typedef gebp_traits Traits; @@ -297,24 +297,24 @@ EIGEN_DONT_INLINE void triangular_solve_matrix \ -struct triangular_solve_matrix \ +struct triangular_solve_matrix \ { \ enum { \ IsLower = (Mode&Lower) == Lower, \ @@ -51,8 +51,10 @@ struct triangular_solve_matrix& /*blocking*/) \ + EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \ + eigen_assert(otherIncr == 1); \ BlasIndex m = convert_index(size), n = convert_index(otherSize), lda, ldb; \ char side = 'L', uplo, diag='N', transa; \ /* Set alpha_ */ \ @@ -80,20 +82,26 @@ struct triangular_solve_matrix \ -struct triangular_solve_matrix \ +struct triangular_solve_matrix \ { \ enum { \ IsLower = (Mode&Lower) == Lower, \ @@ -104,8 +112,10 @@ struct triangular_solve_matrix& /*blocking*/) \ + EIGTYPE* _other, Index otherIncr, Index otherStride, level3_blocking& /*blocking*/) \ { \ + EIGEN_ONLY_USED_FOR_DEBUG(otherIncr); \ + eigen_assert(otherIncr == 1); \ BlasIndex m = convert_index(otherSize), n = convert_index(size), lda, ldb; \ char side = 'R', uplo, diag='N', transa; \ /* Set alpha_ */ \ @@ -133,16 +143,22 @@ struct triangular_solve_matrix + int ResStorageOrder, int ResInnerStride> struct general_matrix_matrix_product; template +class BlasLinearMapper; + template -class BlasLinearMapper { +class BlasLinearMapper { public: typedef typename 
packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data) : m_data(data) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data, Index incr=1) + : m_data(data) + { + EIGEN_ONLY_USED_FOR_DEBUG(incr); + eigen_assert(incr==1); + } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { internal::prefetch(&operator()(i)); @@ -188,16 +196,25 @@ class BlasLinearMapper { }; // Lightweight helper class to access matrix coefficients. -template -class blas_data_mapper { - public: +template +class blas_data_mapper; + +template +class blas_data_mapper +{ +public: typedef typename packet_traits::type Packet; typedef typename packet_traits::half HalfPacket; typedef BlasLinearMapper LinearMapper; typedef BlasVectorMapper VectorMapper; - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride) : m_data(data), m_stride(stride) {} + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr=1) + : m_data(data), m_stride(stride) + { + EIGEN_ONLY_USED_FOR_DEBUG(incr); + eigen_assert(incr==1); + } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper getSubMapper(Index i, Index j) const { @@ -251,6 +268,90 @@ class blas_data_mapper { const Index m_stride; }; +// Implementation of non-natural increment (i.e. inner-stride != 1) +// The exposed API is not complete yet compared to the Incr==1 case +// because some features makes less sense in this case. +template +class BlasLinearMapper +{ +public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE BlasLinearMapper(Scalar *data,Index incr) : m_data(data), m_incr(incr) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void prefetch(int i) const { + internal::prefetch(&operator()(i)); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar& operator()(Index i) const { + return m_data[i*m_incr.value()]; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i) const { + return pgather(m_data + i*m_incr.value(), m_incr.value()); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void storePacket(Index i, const PacketType &p) const { + pscatter(m_data + i*m_incr.value(), p, m_incr.value()); + } + +protected: + Scalar *m_data; + const internal::variable_if_dynamic m_incr; +}; + +template +class blas_data_mapper +{ +public: + typedef typename packet_traits::type Packet; + typedef typename packet_traits::half HalfPacket; + + typedef BlasLinearMapper LinearMapper; + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper(Scalar* data, Index stride, Index incr) : m_data(data), m_stride(stride), m_incr(incr) {} + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE blas_data_mapper + getSubMapper(Index i, Index j) const { + return blas_data_mapper(&operator()(i, j), m_stride, m_incr.value()); + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LinearMapper getLinearMapper(Index i, Index j) const { + return LinearMapper(&operator()(i, j), m_incr.value()); + } + + EIGEN_DEVICE_FUNC + EIGEN_ALWAYS_INLINE Scalar& operator()(Index i, Index j) const { + return m_data[StorageOrder==RowMajor ? 
j*m_incr.value() + i*m_stride : i*m_incr.value() + j*m_stride]; + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Packet loadPacket(Index i, Index j) const { + return pgather(&operator()(i, j),m_incr.value()); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE PacketT load(Index i, Index j) const { + return pgather(&operator()(i, j),m_incr.value()); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void scatterPacket(Index i, Index j, const SubPacket &p) const { + pscatter(&operator()(i, j), p, m_stride); + } + + template + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE SubPacket gatherPacket(Index i, Index j) const { + return pgather(&operator()(i, j), m_stride); + } + +protected: + Scalar* EIGEN_RESTRICT m_data; + const Index m_stride; + const internal::variable_if_dynamic m_incr; +}; + // lightweight helper class to access matrix coefficients (const version) template class const_blas_data_mapper : public blas_data_mapper { diff --git a/eigenlib/Eigen/src/Core/util/DisableStupidWarnings.h b/eigenlib/Eigen/src/Core/util/DisableStupidWarnings.h index 7559e129..74f74cc4 100755 --- a/eigenlib/Eigen/src/Core/util/DisableStupidWarnings.h +++ b/eigenlib/Eigen/src/Core/util/DisableStupidWarnings.h @@ -43,13 +43,24 @@ #endif #pragma clang diagnostic ignored "-Wconstant-logical-operand" -#elif defined __GNUC__ && __GNUC__>=6 +#elif defined __GNUC__ - #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS + #if (!defined(EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS)) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic push #endif - #pragma GCC diagnostic ignored "-Wignored-attributes" - + // g++ warns about local variables shadowing member functions, which is too strict + #pragma GCC diagnostic ignored "-Wshadow" + #if __GNUC__ == 4 && __GNUC_MINOR__ < 8 + // Until g++-4.7 there are warnings when comparing unsigned int vs 0, even in templated functions: + #pragma GCC diagnostic ignored "-Wtype-limits" + #endif + #if __GNUC__>=6 + #pragma GCC diagnostic ignored "-Wignored-attributes" + #endif + #if __GNUC__==7 + // See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=89325 + #pragma GCC diagnostic ignored "-Wattributes" + #endif #endif #if defined __NVCC__ @@ -72,4 +83,12 @@ #pragma diag_suppress 2737 #endif +#else +// warnings already disabled: +# ifndef EIGEN_WARNINGS_DISABLED_2 +# define EIGEN_WARNINGS_DISABLED_2 +# elif defined(EIGEN_INTERNAL_DEBUGGING) +# error "Do not include \"DisableStupidWarnings.h\" recursively more than twice!" 
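// --- annotation (editor's sketch, not part of the patch) -------------------
// EIGEN_WARNINGS_DISABLED / EIGEN_WARNINGS_DISABLED_2 turn the warning
// pragmas into a depth-2 include guard: a nested Disable/Reenable pair (as
// happens when Eigen headers include each other) must not pop the pragma
// stack early, and a third recursive include becomes a hard #error when
// EIGEN_INTERNAL_DEBUGGING is defined. A runtime model of the same
// discipline, with hypothetical names:
#include <cstdio>

static int depth = 0;

void disable_warnings() {                 // DisableStupidWarnings.h analogue
  if (depth++ == 0) std::puts("pragma push (outermost disable)");
  else              std::puts("nested disable: no push");
}
void reenable_warnings() {                // ReenableStupidWarnings.h analogue
  if (--depth == 0) std::puts("pragma pop (outermost reenable)");
  else              std::puts("nested reenable: no pop");
}

int main() {
  disable_warnings();    // outer header
  disable_warnings();    //   nested header
  reenable_warnings();   //   nested reenable, must not pop yet
  reenable_warnings();   // outer reenable, pops
}
// ---------------------------------------------------------------------------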
+# endif + #endif // not EIGEN_WARNINGS_DISABLED diff --git a/eigenlib/Eigen/src/Core/util/ForwardDeclarations.h b/eigenlib/Eigen/src/Core/util/ForwardDeclarations.h index ea107393..134544f9 100644 --- a/eigenlib/Eigen/src/Core/util/ForwardDeclarations.h +++ b/eigenlib/Eigen/src/Core/util/ForwardDeclarations.h @@ -47,11 +47,7 @@ template struct NumTraits; template struct EigenBase; template class DenseBase; template class PlainObjectBase; - - -template::value > -class DenseCoeffsBase; +template class DenseCoeffsBase; template /*Check IMKL version for compatibility: < 10.3 is not usable with Eigen*/ @@ -108,6 +109,10 @@ #endif #endif +#if defined(EIGEN_USE_BLAS) && !defined(EIGEN_USE_MKL) +#include "../../misc/blas.h" +#endif + namespace Eigen { typedef std::complex dcomplex; @@ -121,8 +126,5 @@ typedef int BlasIndex; } // end namespace Eigen -#if defined(EIGEN_USE_BLAS) -#include "../../misc/blas.h" -#endif #endif // EIGEN_MKL_SUPPORT_H diff --git a/eigenlib/Eigen/src/Core/util/Macros.h b/eigenlib/Eigen/src/Core/util/Macros.h index ace330b3..6b0399eb 100644 --- a/eigenlib/Eigen/src/Core/util/Macros.h +++ b/eigenlib/Eigen/src/Core/util/Macros.h @@ -13,7 +13,7 @@ #define EIGEN_WORLD_VERSION 3 #define EIGEN_MAJOR_VERSION 3 -#define EIGEN_MINOR_VERSION 2 +#define EIGEN_MINOR_VERSION 9 #define EIGEN_VERSION_AT_LEAST(x,y,z) (EIGEN_WORLD_VERSION>x || (EIGEN_WORLD_VERSION>=x && \ (EIGEN_MAJOR_VERSION>y || (EIGEN_MAJOR_VERSION>=y && \ @@ -80,8 +80,8 @@ // 2015 14 1900 // "15" 15 1900 -/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC -#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC) +/// \internal EIGEN_COMP_MSVC_STRICT set to 1 if the compiler is really Microsoft Visual C++ and not ,e.g., ICC or clang-cl +#if EIGEN_COMP_MSVC && !(EIGEN_COMP_ICC || EIGEN_COMP_LLVM || EIGEN_COMP_CLANG) #define EIGEN_COMP_MSVC_STRICT _MSC_VER #else #define EIGEN_COMP_MSVC_STRICT 0 @@ -380,7 +380,8 @@ #if EIGEN_MAX_CPP_VER>=11 && \ ((defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901)) \ || (defined(__GNUC__) && defined(_GLIBCXX_USE_C99)) \ - || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER))) + || (defined(_LIBCPP_VERSION) && !defined(_MSC_VER)) \ + || (EIGEN_COMP_MSVC >= 1900) ) #define EIGEN_HAS_C99_MATH 1 #else #define EIGEN_HAS_C99_MATH 0 @@ -396,10 +397,24 @@ #endif #endif +// Does the compiler support type_traits? +// - full support of type traits was added only to GCC 5.1.0. +// - 20150626 corresponds to the last release of 4.x libstdc++ +#ifndef EIGEN_HAS_TYPE_TRAITS +#if EIGEN_MAX_CPP_VER>=11 && (EIGEN_HAS_CXX11 || EIGEN_COMP_MSVC >= 1700) \ + && ((!EIGEN_COMP_GNUC_STRICT) || EIGEN_GNUC_AT_LEAST(5, 1)) \ + && ((!defined(__GLIBCXX__)) || __GLIBCXX__ > 20150626) +#define EIGEN_HAS_TYPE_TRAITS 1 +#define EIGEN_INCLUDE_TYPE_TRAITS +#else +#define EIGEN_HAS_TYPE_TRAITS 0 +#endif +#endif + // Does the compiler support variadic templates? 
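// --- annotation (editor's sketch, not part of the patch) -------------------
// Besides the version bump to 3.3.9 and excluding clang-cl from
// EIGEN_COMP_MSVC_STRICT, Macros.h grows feature gates like the
// EIGEN_HAS_TYPE_TRAITS block above, which cannot rely on -std alone:
// <type_traits> completeness depends on the libstdc++ vintage (the
// __GLIBCXX__ date stamp), not just the language level. A simplified,
// self-contained version of such a gate (MY_HAS_TYPE_TRAITS is a
// hypothetical name):
#if (__cplusplus >= 201103L) && \
    (!defined(__GLIBCXX__) || __GLIBCXX__ > 20150626)
#  define MY_HAS_TYPE_TRAITS 1
#else
#  define MY_HAS_TYPE_TRAITS 0
#endif

#if MY_HAS_TYPE_TRAITS
#  include <type_traits>
static_assert(std::is_integral<int>::value, "type_traits usable");
#endif

int main() {}
// ---------------------------------------------------------------------------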
#ifndef EIGEN_HAS_VARIADIC_TEMPLATES #if EIGEN_MAX_CPP_VER>=11 && (__cplusplus > 199711L || EIGEN_COMP_MSVC >= 1900) \ - && ( !defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (defined __CUDACC_VER__ && __CUDACC_VER__ >= 80000) ) + && (!defined(__NVCC__) || !EIGEN_ARCH_ARM_OR_ARM64 || (EIGEN_CUDACC_VER >= 80000) ) // ^^ Disable the use of variadic templates when compiling with versions of nvcc older than 8.0 on ARM devices: // this prevents nvcc from crashing when compiling Eigen on Tegra X1 #define EIGEN_HAS_VARIADIC_TEMPLATES 1 @@ -413,7 +428,7 @@ #ifdef __CUDACC__ // Const expressions are supported provided that c++11 is enabled and we're using either clang or nvcc 7.5 or above -#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && defined(__CUDACC_VER__) && (EIGEN_COMP_CLANG || __CUDACC_VER__ >= 70500)) +#if EIGEN_MAX_CPP_VER>=14 && (__cplusplus > 199711L && (EIGEN_COMP_CLANG || EIGEN_CUDACC_VER >= 70500)) #define EIGEN_HAS_CONSTEXPR 1 #endif #elif EIGEN_MAX_CPP_VER>=14 && (__has_feature(cxx_relaxed_constexpr) || (defined(__cplusplus) && __cplusplus >= 201402L) || \ @@ -487,11 +502,13 @@ // EIGEN_STRONG_INLINE is a stronger version of the inline, using __forceinline on MSVC, // but it still doesn't use GCC's always_inline. This is useful in (common) situations where MSVC needs forceinline // but GCC is still doing fine with just inline. +#ifndef EIGEN_STRONG_INLINE #if EIGEN_COMP_MSVC || EIGEN_COMP_ICC #define EIGEN_STRONG_INLINE __forceinline #else #define EIGEN_STRONG_INLINE inline #endif +#endif // EIGEN_ALWAYS_INLINE is the stronget, it has the effect of making the function inline and adding every possible // attribute to maximize inlining. This should only be used when really necessary: in particular, @@ -812,7 +829,8 @@ namespace Eigen { // just an empty macro ! #define EIGEN_EMPTY -#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || __CUDACC_VER__) // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) +#if EIGEN_COMP_MSVC_STRICT && (EIGEN_COMP_MSVC < 1900 || EIGEN_CUDACC_VER>0) + // for older MSVC versions, as well as 1900 && CUDA 8, using the base operator is sufficient (cf Bugs 1000, 1324) #define EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ using Base::operator =; #elif EIGEN_COMP_CLANG // workaround clang bug (see http://forum.kde.org/viewtopic.php?f=74&t=102653) @@ -832,11 +850,48 @@ namespace Eigen { #endif +/** + * \internal + * \brief Macro to explicitly define the default copy constructor. + * This is necessary, because the implicit definition is deprecated if the copy-assignment is overridden. + */ +#if EIGEN_HAS_CXX11 +#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) EIGEN_DEVICE_FUNC CLASS(const CLASS&) = default; +#else +#define EIGEN_DEFAULT_COPY_CONSTRUCTOR(CLASS) +#endif + + + /** \internal * \brief Macro to manually inherit assignment operators. * This is necessary, because the implicitly defined assignment operator gets deleted when a custom operator= is defined. + * With C++11 or later this also default-implements the copy-constructor */ -#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) +#define EIGEN_INHERIT_ASSIGNMENT_OPERATORS(Derived) \ + EIGEN_INHERIT_ASSIGNMENT_EQUAL_OPERATOR(Derived) \ + EIGEN_DEFAULT_COPY_CONSTRUCTOR(Derived) + +/** \internal + * \brief Macro to manually define default constructors and destructors. + * This is necessary when the copy constructor is re-defined. 
+ * For empty helper classes this should usually be protected, to avoid accidentally creating empty objects. + * + * Hiding the default destructor lead to problems in C++03 mode together with boost::multiprecision + */ +#if EIGEN_HAS_CXX11 +#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ + EIGEN_DEVICE_FUNC Derived() = default; \ + EIGEN_DEVICE_FUNC ~Derived() = default; +#else +#define EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(Derived) \ + EIGEN_DEVICE_FUNC Derived() {}; \ + /* EIGEN_DEVICE_FUNC ~Derived() {}; */ +#endif + + + + /** * Just a side note. Commenting within defines works only by documenting @@ -986,7 +1041,13 @@ namespace Eigen { # define EIGEN_NOEXCEPT # define EIGEN_NOEXCEPT_IF(x) # define EIGEN_NO_THROW throw() -# define EIGEN_EXCEPTION_SPEC(X) throw(X) +# if EIGEN_COMP_MSVC + // MSVC does not support exception specifications (warning C4290), + // and they are deprecated in c++11 anyway. +# define EIGEN_EXCEPTION_SPEC(X) throw() +# else +# define EIGEN_EXCEPTION_SPEC(X) throw(X) +# endif #endif #endif // EIGEN_MACROS_H diff --git a/eigenlib/Eigen/src/Core/util/Memory.h b/eigenlib/Eigen/src/Core/util/Memory.h index 67053db6..291383c5 100644 --- a/eigenlib/Eigen/src/Core/util/Memory.h +++ b/eigenlib/Eigen/src/Core/util/Memory.h @@ -70,7 +70,7 @@ inline void throw_std_bad_alloc() throw std::bad_alloc(); #else std::size_t huge = static_cast(-1); - new int[huge]; + ::operator new(huge); #endif } @@ -150,7 +150,7 @@ EIGEN_DEVICE_FUNC inline void check_that_malloc_is_allowed() /** \internal Allocates \a size bytes. The returned pointer is guaranteed to have 16 or 32 bytes alignment depending on the requirements. * On allocation error, the returned pointer is null, and std::bad_alloc is thrown. */ -EIGEN_DEVICE_FUNC inline void* aligned_malloc(size_t size) +EIGEN_DEVICE_FUNC inline void* aligned_malloc(std::size_t size) { check_that_malloc_is_allowed(); @@ -185,7 +185,7 @@ EIGEN_DEVICE_FUNC inline void aligned_free(void *ptr) * \brief Reallocates an aligned block of memory. * \throws std::bad_alloc on allocation failure */ -inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) +inline void* aligned_realloc(void *ptr, std::size_t new_size, std::size_t old_size) { EIGEN_UNUSED_VARIABLE(old_size); @@ -209,12 +209,12 @@ inline void* aligned_realloc(void *ptr, size_t new_size, size_t old_size) /** \internal Allocates \a size bytes. If Align is true, then the returned ptr is 16-byte-aligned. * On allocation error, the returned pointer is null, and a std::bad_alloc is thrown. 
*/ -template EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size) +template EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std::size_t size) { return aligned_malloc(size); } -template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(size_t size) +template<> EIGEN_DEVICE_FUNC inline void* conditional_aligned_malloc(std::size_t size) { check_that_malloc_is_allowed(); @@ -235,12 +235,12 @@ template<> EIGEN_DEVICE_FUNC inline void conditional_aligned_free(void *p std::free(ptr); } -template inline void* conditional_aligned_realloc(void* ptr, size_t new_size, size_t old_size) +template inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t old_size) { return aligned_realloc(ptr, new_size, old_size); } -template<> inline void* conditional_aligned_realloc(void* ptr, size_t new_size, size_t) +template<> inline void* conditional_aligned_realloc(void* ptr, std::size_t new_size, std::size_t) { return std::realloc(ptr, new_size); } @@ -252,7 +252,7 @@ template<> inline void* conditional_aligned_realloc(void* ptr, size_t new /** \internal Destructs the elements of an array. * The \a size parameters tells on how many objects to call the destructor of T. */ -template EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T *ptr, std::size_t size) { // always destruct an array starting from the end. if(ptr) @@ -262,9 +262,9 @@ template EIGEN_DEVICE_FUNC inline void destruct_elements_of_array(T /** \internal Constructs the elements of an array. * The \a size parameter tells on how many objects to call the constructor of T. */ -template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T *ptr, std::size_t size) { - size_t i; + std::size_t i; EIGEN_TRY { for (i = 0; i < size; ++i) ::new (ptr + i) T; @@ -283,9 +283,9 @@ template EIGEN_DEVICE_FUNC inline T* construct_elements_of_array(T * *****************************************************************************/ template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) +EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(std::size_t size) { - if(size > size_t(-1) / sizeof(T)) + if(size > std::size_t(-1) / sizeof(T)) throw_std_bad_alloc(); } @@ -293,7 +293,7 @@ EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE void check_size_for_overflow(size_t size) * On allocation error, the returned pointer is undefined, but a std::bad_alloc is thrown. * The default constructor of T is called. */ -template EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size) +template EIGEN_DEVICE_FUNC inline T* aligned_new(std::size_t size) { check_size_for_overflow(size); T *result = reinterpret_cast(aligned_malloc(sizeof(T)*size)); @@ -309,7 +309,7 @@ template EIGEN_DEVICE_FUNC inline T* aligned_new(size_t size) return result; } -template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(size_t size) +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new(std::size_t size) { check_size_for_overflow(size); T *result = reinterpret_cast(conditional_aligned_malloc(sizeof(T)*size)); @@ -328,7 +328,7 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned /** \internal Deletes objects constructed with aligned_new * The \a size parameters tells on how many objects to call the destructor of T. 
*/ -template EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, std::size_t size) { destruct_elements_of_array(ptr, size); aligned_free(ptr); @@ -337,13 +337,13 @@ template EIGEN_DEVICE_FUNC inline void aligned_delete(T *ptr, size_t /** \internal Deletes objects constructed with conditional_aligned_new * The \a size parameters tells on how many objects to call the destructor of T. */ -template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete(T *ptr, std::size_t size) { destruct_elements_of_array(ptr, size); conditional_aligned_free(ptr); } -template EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, size_t new_size, size_t old_size) +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_realloc_new(T* pts, std::size_t new_size, std::size_t old_size) { check_size_for_overflow(new_size); check_size_for_overflow(old_size); @@ -366,7 +366,7 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned } -template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(size_t size) +template EIGEN_DEVICE_FUNC inline T* conditional_aligned_new_auto(std::size_t size) { if(size==0) return 0; // short-cut. Also fixes Bug 884 @@ -387,7 +387,7 @@ template EIGEN_DEVICE_FUNC inline T* conditional_aligned return result; } -template inline T* conditional_aligned_realloc_new_auto(T* pts, size_t new_size, size_t old_size) +template inline T* conditional_aligned_realloc_new_auto(T* pts, std::size_t new_size, std::size_t old_size) { check_size_for_overflow(new_size); check_size_for_overflow(old_size); @@ -409,7 +409,7 @@ template inline T* conditional_aligned_realloc_new_auto( return result; } -template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, size_t size) +template EIGEN_DEVICE_FUNC inline void conditional_aligned_delete_auto(T *ptr, std::size_t size) { if(NumTraits::RequireInitialization) destruct_elements_of_array(ptr, size); @@ -493,7 +493,7 @@ template struct smart_copy_helper { IntPtr size = IntPtr(end)-IntPtr(start); if(size==0) return; eigen_internal_assert(start!=0 && end!=0 && target!=0); - memcpy(target, start, size); + std::memcpy(target, start, size); } }; @@ -561,7 +561,7 @@ template class aligned_stack_memory_handler : noncopyable * In this case, the buffer elements will also be destructed when this handler will be destructed. * Finally, if \a dealloc is true, then the pointer \a ptr is freed. **/ - aligned_stack_memory_handler(T* ptr, size_t size, bool dealloc) + aligned_stack_memory_handler(T* ptr, std::size_t size, bool dealloc) : m_ptr(ptr), m_size(size), m_deallocate(dealloc) { if(NumTraits::RequireInitialization && m_ptr) @@ -576,7 +576,7 @@ template class aligned_stack_memory_handler : noncopyable } protected: T* m_ptr; - size_t m_size; + std::size_t m_size; bool m_deallocate; }; @@ -655,15 +655,15 @@ template void swap(scoped_array &a,scoped_array &b) #if EIGEN_MAX_ALIGN_BYTES!=0 #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_NOTHROW(NeedsToAlign) \ - void* operator new(size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \ + void* operator new(std::size_t size, const std::nothrow_t&) EIGEN_NO_THROW { \ EIGEN_TRY { return Eigen::internal::conditional_aligned_malloc(size); } \ EIGEN_CATCH (...) 
{ return 0; } \ } #define EIGEN_MAKE_ALIGNED_OPERATOR_NEW_IF(NeedsToAlign) \ - void *operator new(size_t size) { \ + void *operator new(std::size_t size) { \ return Eigen::internal::conditional_aligned_malloc(size); \ } \ - void *operator new[](size_t size) { \ + void *operator new[](std::size_t size) { \ return Eigen::internal::conditional_aligned_malloc(size); \ } \ void operator delete(void * ptr) EIGEN_NO_THROW { Eigen::internal::conditional_aligned_free(ptr); } \ @@ -673,8 +673,8 @@ template void swap(scoped_array &a,scoped_array &b) /* in-place new and delete. since (at least afaik) there is no actual */ \ /* memory allocated we can safely let the default implementation handle */ \ /* this particular case. */ \ - static void *operator new(size_t size, void *ptr) { return ::operator new(size,ptr); } \ - static void *operator new[](size_t size, void* ptr) { return ::operator new[](size,ptr); } \ + static void *operator new(std::size_t size, void *ptr) { return ::operator new(size,ptr); } \ + static void *operator new[](std::size_t size, void* ptr) { return ::operator new[](size,ptr); } \ void operator delete(void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete(memory,ptr); } \ void operator delete[](void * memory, void *ptr) EIGEN_NO_THROW { return ::operator delete[](memory,ptr); } \ /* nothrow-new (returns zero instead of std::bad_alloc) */ \ @@ -696,7 +696,15 @@ template void swap(scoped_array &a,scoped_array &b) /** \class aligned_allocator * \ingroup Core_Module * -* \brief STL compatible allocator to use with with 16 byte aligned types +* \brief STL compatible allocator to use with types requiring a non standrad alignment. +* +* The memory is aligned as for dynamically aligned matrix/array types such as MatrixXd. +* By default, it will thus provide at least 16 bytes alignment and more in following cases: +* - 32 bytes alignment if AVX is enabled. +* - 64 bytes alignment if AVX512 is enabled. +* +* This can be controled using the \c EIGEN_MAX_ALIGN_BYTES macro as documented +* \link TopicPreprocessorDirectivesPerformance there \endlink. 
* * Example: * \code @@ -713,7 +721,7 @@ template class aligned_allocator : public std::allocator { public: - typedef size_t size_type; + typedef std::size_t size_type; typedef std::ptrdiff_t difference_type; typedef T* pointer; typedef const T* const_pointer; @@ -739,7 +747,15 @@ public: pointer allocate(size_type num, const void* /*hint*/ = 0) { internal::check_size_for_overflow(num); - return static_cast( internal::aligned_malloc(num * sizeof(T)) ); + size_type size = num * sizeof(T); +#if EIGEN_COMP_GNUC_STRICT && EIGEN_GNUC_AT_LEAST(7,0) + // workaround gcc bug https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87544 + // It triggered eigen/Eigen/src/Core/util/Memory.h:189:12: warning: argument 1 value '18446744073709551612' exceeds maximum object size 9223372036854775807 + if(size>=std::size_t((std::numeric_limits::max)())) + return 0; + else +#endif + return static_cast( internal::aligned_malloc(size) ); } void deallocate(pointer p, size_type /*num*/) diff --git a/eigenlib/Eigen/src/Core/util/Meta.h b/eigenlib/Eigen/src/Core/util/Meta.h index 7f637075..9b61ff03 100755 --- a/eigenlib/Eigen/src/Core/util/Meta.h +++ b/eigenlib/Eigen/src/Core/util/Meta.h @@ -97,6 +97,9 @@ template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; template<> struct is_arithmetic { enum { value = true }; }; +#if EIGEN_HAS_CXX11 +using std::is_integral; +#else template struct is_integral { enum { value = false }; }; template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; @@ -108,6 +111,33 @@ template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; template<> struct is_integral { enum { value = true }; }; +#if EIGEN_COMP_MSVC +template<> struct is_integral { enum { value = true }; }; +template<> struct is_integral{ enum { value = true }; }; +#endif +#endif + +#if EIGEN_HAS_CXX11 +using std::make_unsigned; +#else +// TODO: Possibly improve this implementation of make_unsigned. +// It is currently used only by +// template struct random_default_impl. +template struct make_unsigned; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned char type; }; +template<> struct make_unsigned { typedef unsigned short type; }; +template<> struct make_unsigned { typedef unsigned short type; }; +template<> struct make_unsigned { typedef unsigned int type; }; +template<> struct make_unsigned { typedef unsigned int type; }; +template<> struct make_unsigned { typedef unsigned long type; }; +template<> struct make_unsigned { typedef unsigned long type; }; +#if EIGEN_COMP_MSVC +template<> struct make_unsigned { typedef unsigned __int64 type; }; +template<> struct make_unsigned { typedef unsigned __int64 type; }; +#endif +#endif template struct add_const { typedef const T type; }; template struct add_const { typedef T& type; }; @@ -485,8 +515,54 @@ T div_ceil(const T &a, const T &b) return (a+b-1) / b; } +// The aim of the following functions is to bypass -Wfloat-equal warnings +// when we really want a strict equality comparison on floating points. 
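// --- annotation (editor's sketch, not part of the patch) -------------------
// The helpers defined below route the comparison through std::equal_to /
// std::not_equal_to so that deliberate exact floating-point tests, such as
// the lhs_alpha != LhsScalar(1) checks added earlier in this patch, compile
// cleanly under -Wfloat-equal, which only flags a literal ==/!= token on
// floating operands. The trick in isolation (strictly_equal is a
// hypothetical name):
#include <functional>

inline bool strictly_equal(float x, float y)
{ return std::equal_to<float>()(x, y); }   // bit-exact ==, no raw operator

int main() { return strictly_equal(1.0f, 1.0f) ? 0 : 1; }
// ---------------------------------------------------------------------------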
+template EIGEN_STRONG_INLINE +bool equal_strict(const X& x,const Y& y) { return x == y; } + +template<> EIGEN_STRONG_INLINE +bool equal_strict(const float& x,const float& y) { return std::equal_to()(x,y); } + +template<> EIGEN_STRONG_INLINE +bool equal_strict(const double& x,const double& y) { return std::equal_to()(x,y); } + +template EIGEN_STRONG_INLINE +bool not_equal_strict(const X& x,const Y& y) { return x != y; } + +template<> EIGEN_STRONG_INLINE +bool not_equal_strict(const float& x,const float& y) { return std::not_equal_to()(x,y); } + +template<> EIGEN_STRONG_INLINE +bool not_equal_strict(const double& x,const double& y) { return std::not_equal_to()(x,y); } + } // end namespace numext } // end namespace Eigen +// Define portable (u)int{32,64} types +#if EIGEN_HAS_CXX11 +#include +namespace Eigen { +namespace numext { +typedef std::uint32_t uint32_t; +typedef std::int32_t int32_t; +typedef std::uint64_t uint64_t; +typedef std::int64_t int64_t; +} +} +#else +// Without C++11, all compilers able to compile Eigen also +// provide the C99 stdint.h header file. +#include +namespace Eigen { +namespace numext { +typedef ::uint32_t uint32_t; +typedef ::int32_t int32_t; +typedef ::uint64_t uint64_t; +typedef ::int64_t int64_t; +} +} +#endif + + #endif // EIGEN_META_H diff --git a/eigenlib/Eigen/src/Core/util/ReenableStupidWarnings.h b/eigenlib/Eigen/src/Core/util/ReenableStupidWarnings.h index 86b60f52..1ce6fd1b 100644 --- a/eigenlib/Eigen/src/Core/util/ReenableStupidWarnings.h +++ b/eigenlib/Eigen/src/Core/util/ReenableStupidWarnings.h @@ -1,4 +1,8 @@ -#ifdef EIGEN_WARNINGS_DISABLED +#ifdef EIGEN_WARNINGS_DISABLED_2 +// "DisableStupidWarnings.h" was included twice recursively: Do not reenable warnings yet! +# undef EIGEN_WARNINGS_DISABLED_2 + +#elif defined(EIGEN_WARNINGS_DISABLED) #undef EIGEN_WARNINGS_DISABLED #ifndef EIGEN_PERMANENTLY_DISABLE_STUPID_WARNINGS @@ -8,7 +12,7 @@ #pragma warning pop #elif defined __clang__ #pragma clang diagnostic pop - #elif defined __GNUC__ && __GNUC__>=6 + #elif defined __GNUC__ && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)) #pragma GCC diagnostic pop #endif diff --git a/eigenlib/Eigen/src/Core/util/StaticAssert.h b/eigenlib/Eigen/src/Core/util/StaticAssert.h index 983361a4..500e4779 100644 --- a/eigenlib/Eigen/src/Core/util/StaticAssert.h +++ b/eigenlib/Eigen/src/Core/util/StaticAssert.h @@ -24,6 +24,7 @@ * */ +#ifndef EIGEN_STATIC_ASSERT #ifndef EIGEN_NO_STATIC_ASSERT #if EIGEN_MAX_CPP_VER>=11 && (__has_feature(cxx_static_assert) || (defined(__cplusplus) && __cplusplus >= 201103L) || (EIGEN_COMP_MSVC >= 1600)) @@ -44,64 +45,65 @@ struct static_assertion { enum { - YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX, - YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES, - YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES, - THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE, - THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE, - THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE, - OUT_OF_RANGE_ACCESS, - YOU_MADE_A_PROGRAMMING_MISTAKE, - EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT, - EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE, - YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR, - YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR, - UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC, - THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES, - FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED, - NUMERIC_TYPE_MUST_BE_REAL, - COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED, -
WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED, - THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE, - INVALID_MATRIX_PRODUCT, - INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS, - INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION, - YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY, - THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES, - THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES, - INVALID_MATRIX_TEMPLATE_PARAMETERS, - INVALID_MATRIXBASE_TEMPLATE_PARAMETERS, - BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER, - THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX, - THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE, - THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES, - YOU_ALREADY_SPECIFIED_THIS_STRIDE, - INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION, - THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD, - PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1, - THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS, - YOU_CANNOT_MIX_ARRAYS_AND_MATRICES, - YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION, - THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY, - YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT, - THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS, - THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS, - THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL, - THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES, - YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED, - YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED, - THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE, - THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH, - OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG, - IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY, - STORAGE_LAYOUT_DOES_NOT_MATCH, - EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE, - THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS, - MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY, - THIS_TYPE_IS_NOT_SUPPORTED, - STORAGE_KIND_MUST_MATCH, - STORAGE_INDEX_MUST_MATCH, - CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY + YOU_TRIED_CALLING_A_VECTOR_METHOD_ON_A_MATRIX=1, + YOU_MIXED_VECTORS_OF_DIFFERENT_SIZES=1, + YOU_MIXED_MATRICES_OF_DIFFERENT_SIZES=1, + THIS_METHOD_IS_ONLY_FOR_VECTORS_OF_A_SPECIFIC_SIZE=1, + THIS_METHOD_IS_ONLY_FOR_MATRICES_OF_A_SPECIFIC_SIZE=1, + THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE=1, + OUT_OF_RANGE_ACCESS=1, + YOU_MADE_A_PROGRAMMING_MISTAKE=1, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT=1, + EIGEN_INTERNAL_COMPILATION_ERROR_OR_YOU_MADE_A_PROGRAMMING_MISTAKE=1, + YOU_CALLED_A_FIXED_SIZE_METHOD_ON_A_DYNAMIC_SIZE_MATRIX_OR_VECTOR=1, + YOU_CALLED_A_DYNAMIC_SIZE_METHOD_ON_A_FIXED_SIZE_MATRIX_OR_VECTOR=1, + UNALIGNED_LOAD_AND_STORE_OPERATIONS_UNIMPLEMENTED_ON_ALTIVEC=1, + THIS_FUNCTION_IS_NOT_FOR_INTEGER_NUMERIC_TYPES=1, + FLOATING_POINT_ARGUMENT_PASSED__INTEGER_WAS_EXPECTED=1, + NUMERIC_TYPE_MUST_BE_REAL=1, + COEFFICIENT_WRITE_ACCESS_TO_SELFADJOINT_NOT_SUPPORTED=1, + WRITING_TO_TRIANGULAR_PART_WITH_UNIT_DIAGONAL_IS_NOT_SUPPORTED=1, + THIS_METHOD_IS_ONLY_FOR_FIXED_SIZE=1, + INVALID_MATRIX_PRODUCT=1, + INVALID_VECTOR_VECTOR_PRODUCT__IF_YOU_WANTED_A_DOT_OR_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTIONS=1, + INVALID_MATRIX_PRODUCT__IF_YOU_WANTED_A_COEFF_WISE_PRODUCT_YOU_MUST_USE_THE_EXPLICIT_FUNCTION=1, + 
YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY=1, + THIS_METHOD_IS_ONLY_FOR_COLUMN_MAJOR_MATRICES=1, + THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES=1, + INVALID_MATRIX_TEMPLATE_PARAMETERS=1, + INVALID_MATRIXBASE_TEMPLATE_PARAMETERS=1, + BOTH_MATRICES_MUST_HAVE_THE_SAME_STORAGE_ORDER=1, + THIS_METHOD_IS_ONLY_FOR_DIAGONAL_MATRIX=1, + THE_MATRIX_OR_EXPRESSION_THAT_YOU_PASSED_DOES_NOT_HAVE_THE_EXPECTED_TYPE=1, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_WITH_DIRECT_MEMORY_ACCESS_SUCH_AS_MAP_OR_PLAIN_MATRICES=1, + YOU_ALREADY_SPECIFIED_THIS_STRIDE=1, + INVALID_STORAGE_ORDER_FOR_THIS_VECTOR_EXPRESSION=1, + THE_BRACKET_OPERATOR_IS_ONLY_FOR_VECTORS__USE_THE_PARENTHESIS_OPERATOR_INSTEAD=1, + PACKET_ACCESS_REQUIRES_TO_HAVE_INNER_STRIDE_FIXED_TO_1=1, + THIS_METHOD_IS_ONLY_FOR_SPECIFIC_TRANSFORMATIONS=1, + YOU_CANNOT_MIX_ARRAYS_AND_MATRICES=1, + YOU_PERFORMED_AN_INVALID_TRANSFORMATION_CONVERSION=1, + THIS_EXPRESSION_IS_NOT_A_LVALUE__IT_IS_READ_ONLY=1, + YOU_ARE_TRYING_TO_USE_AN_INDEX_BASED_ACCESSOR_ON_AN_EXPRESSION_THAT_DOES_NOT_SUPPORT_THAT=1, + THIS_METHOD_IS_ONLY_FOR_1x1_EXPRESSIONS=1, + THIS_METHOD_IS_ONLY_FOR_INNER_OR_LAZY_PRODUCTS=1, + THIS_METHOD_IS_ONLY_FOR_EXPRESSIONS_OF_BOOL=1, + THIS_METHOD_IS_ONLY_FOR_ARRAYS_NOT_MATRICES=1, + YOU_PASSED_A_ROW_VECTOR_BUT_A_COLUMN_VECTOR_WAS_EXPECTED=1, + YOU_PASSED_A_COLUMN_VECTOR_BUT_A_ROW_VECTOR_WAS_EXPECTED=1, + THE_INDEX_TYPE_MUST_BE_A_SIGNED_TYPE=1, + THE_STORAGE_ORDER_OF_BOTH_SIDES_MUST_MATCH=1, + OBJECT_ALLOCATED_ON_STACK_IS_TOO_BIG=1, + IMPLICIT_CONVERSION_TO_SCALAR_IS_FOR_INNER_PRODUCT_ONLY=1, + STORAGE_LAYOUT_DOES_NOT_MATCH=1, + EIGEN_INTERNAL_ERROR_PLEASE_FILE_A_BUG_REPORT__INVALID_COST_VALUE=1, + THIS_COEFFICIENT_ACCESSOR_TAKING_ONE_ACCESS_IS_ONLY_FOR_EXPRESSIONS_ALLOWING_LINEAR_ACCESS=1, + MATRIX_FREE_CONJUGATE_GRADIENT_IS_COMPATIBLE_WITH_UPPER_UNION_LOWER_MODE_ONLY=1, + THIS_TYPE_IS_NOT_SUPPORTED=1, + STORAGE_KIND_MUST_MATCH=1, + STORAGE_INDEX_MUST_MATCH=1, + CHOLMOD_SUPPORTS_DOUBLE_PRECISION_ONLY=1, + SELFADJOINTVIEW_ACCEPTS_UPPER_AND_LOWER_MODE_ONLY=1 }; }; @@ -131,7 +133,7 @@ #define EIGEN_STATIC_ASSERT(CONDITION,MSG) eigen_assert((CONDITION) && #MSG); #endif // EIGEN_NO_STATIC_ASSERT - +#endif // EIGEN_STATIC_ASSERT // static assertion failing if the type \a TYPE is not a vector type #define EIGEN_STATIC_ASSERT_VECTOR_ONLY(TYPE) \ diff --git a/eigenlib/Eigen/src/Core/util/XprHelper.h b/eigenlib/Eigen/src/Core/util/XprHelper.h index e88510dc..6bb49708 100644 --- a/eigenlib/Eigen/src/Core/util/XprHelper.h +++ b/eigenlib/Eigen/src/Core/util/XprHelper.h @@ -34,6 +34,20 @@ inline IndexDest convert_index(const IndexSrc& idx) { return IndexDest(idx); } +// true if T can be considered as an integral index (i.e., an integral type or an enum) +template struct is_valid_index_type +{ + enum { value = +#if EIGEN_HAS_TYPE_TRAITS + internal::is_integral::value || std::is_enum::value +#elif EIGEN_COMP_MSVC + internal::is_integral::value || __is_enum(T) +#else + // without C++11, we use is_convertible to Index instead of is_integral in order to treat enums as Index.
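What the trait must accept can be seen from a small sketch (types and names are illustrative):

\code
enum Axis { X = 0, Y = 1, Z = 2 };
Eigen::MatrixXd M(3, 3);
double v = M(Y, Z);   // enums must be usable as integral indices
\endcode

Lacking type traits, the pre-C++11 branch below falls back to convertibility to Index.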
+ internal::is_convertible::value && !internal::is_same::value && !is_same::value +#endif + }; +}; // promote_scalar_arg is a helper used in operations between an expression and a scalar, like: // expression * scalar @@ -90,6 +104,9 @@ class no_assignment_operator { private: no_assignment_operator& operator=(const no_assignment_operator&); + protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(no_assignment_operator) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(no_assignment_operator) }; /** \internal return the index type with the largest number of bits */ @@ -638,7 +655,7 @@ struct plain_constant_type template struct is_lvalue { - enum { value = !bool(is_const::value) && + enum { value = (!bool(is_const::value)) && bool(traits::Flags & LvalueBit) }; }; diff --git a/eigenlib/Eigen/src/Eigenvalues/ComplexEigenSolver.h b/eigenlib/Eigen/src/Eigenvalues/ComplexEigenSolver.h index ec3b1633..dc5fae06 100644 --- a/eigenlib/Eigen/src/Eigenvalues/ComplexEigenSolver.h +++ b/eigenlib/Eigen/src/Eigenvalues/ComplexEigenSolver.h @@ -250,7 +250,7 @@ template class ComplexEigenSolver EigenvectorType m_matX; private: - void doComputeEigenvectors(const RealScalar& matrixnorm); + void doComputeEigenvectors(RealScalar matrixnorm); void sortEigenvalues(bool computeEigenvectors); }; @@ -284,10 +284,12 @@ ComplexEigenSolver::compute(const EigenBase& matrix, bool template -void ComplexEigenSolver::doComputeEigenvectors(const RealScalar& matrixnorm) +void ComplexEigenSolver::doComputeEigenvectors(RealScalar matrixnorm) { const Index n = m_eivalues.size(); + matrixnorm = numext::maxi(matrixnorm,(std::numeric_limits::min)()); + // Compute X such that T = X D X^(-1), where D is the diagonal of T. // The matrix X is unit triangular. m_matX = EigenvectorType::Zero(n, n); diff --git a/eigenlib/Eigen/src/Eigenvalues/ComplexSchur.h b/eigenlib/Eigen/src/Eigenvalues/ComplexSchur.h index 7f38919f..4354e401 100644 --- a/eigenlib/Eigen/src/Eigenvalues/ComplexSchur.h +++ b/eigenlib/Eigen/src/Eigenvalues/ComplexSchur.h @@ -300,10 +300,13 @@ typename ComplexSchur::ComplexScalar ComplexSchur::compu ComplexScalar trace = t.coeff(0,0) + t.coeff(1,1); ComplexScalar eival1 = (trace + disc) / RealScalar(2); ComplexScalar eival2 = (trace - disc) / RealScalar(2); - - if(numext::norm1(eival1) > numext::norm1(eival2)) + RealScalar eival1_norm = numext::norm1(eival1); + RealScalar eival2_norm = numext::norm1(eival2); + // A division by zero can only occur if eival1==eival2==0.
+ // In this case, det==0, and all we have to do is check that eival2_norm!=0 + if(eival1_norm > eival2_norm) eival2 = det / eival1; - else + else if(eival2_norm!=RealScalar(0)) eival1 = det / eival2; // choose the eigenvalue closest to the bottom entry of the diagonal diff --git a/eigenlib/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h b/eigenlib/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h index 36a91dff..87d789b3 100644 --- a/eigenlib/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h +++ b/eigenlib/Eigen/src/Eigenvalues/GeneralizedEigenSolver.h @@ -311,7 +311,6 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp // Aliases: Map v(reinterpret_cast(m_tmp.data()), size); ComplexVectorType &cv = m_tmp; - const MatrixType &mZ = m_realQZ.matrixZ(); const MatrixType &mS = m_realQZ.matrixS(); const MatrixType &mT = m_realQZ.matrixT(); @@ -351,7 +350,7 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp } } } - m_eivec.col(i).real().noalias() = mZ.transpose() * v; + m_eivec.col(i).real().noalias() = m_realQZ.matrixZ().transpose() * v; m_eivec.col(i).real().normalize(); m_eivec.col(i).imag().setConstant(0); } @@ -400,7 +399,7 @@ GeneralizedEigenSolver::compute(const MatrixType& A, const MatrixTyp / (alpha*mT.coeffRef(j,j) - static_cast(beta*mS.coeffRef(j,j))); } } - m_eivec.col(i+1).noalias() = (mZ.transpose() * cv); + m_eivec.col(i+1).noalias() = (m_realQZ.matrixZ().transpose() * cv); m_eivec.col(i+1).normalize(); m_eivec.col(i) = m_eivec.col(i+1).conjugate(); } diff --git a/eigenlib/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h b/eigenlib/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h index 4fec8af0..e4e42607 100644 --- a/eigenlib/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h +++ b/eigenlib/Eigen/src/Eigenvalues/MatrixBaseEigenvalues.h @@ -66,7 +66,6 @@ template inline typename MatrixBase::EigenvaluesReturnType MatrixBase::eigenvalues() const { - typedef typename internal::traits::Scalar Scalar; return internal::eigenvalues_selector::IsComplex>::run(derived()); } @@ -88,7 +87,6 @@ template inline typename SelfAdjointView::EigenvaluesReturnType SelfAdjointView::eigenvalues() const { - typedef typename SelfAdjointView::PlainObject PlainObject; PlainObject thisAsMatrix(*this); return SelfAdjointEigenSolver(thisAsMatrix, false).eigenvalues(); } diff --git a/eigenlib/Eigen/src/Eigenvalues/RealSchur.h b/eigenlib/Eigen/src/Eigenvalues/RealSchur.h index d6a339f0..9191519a 100644 --- a/eigenlib/Eigen/src/Eigenvalues/RealSchur.h +++ b/eigenlib/Eigen/src/Eigenvalues/RealSchur.h @@ -236,7 +236,7 @@ template class RealSchur typedef Matrix Vector3s; Scalar computeNormOfT(); - Index findSmallSubdiagEntry(Index iu); + Index findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero); void splitOffTwoRows(Index iu, bool computeU, const Scalar& exshift); void computeShift(Index iu, Index iter, Scalar& exshift, Vector3s& shiftInfo); void initFrancisQRStep(Index il, Index iu, const Vector3s& shiftInfo, Index& im, Vector3s& firstHouseholderVector); @@ -248,12 +248,24 @@ template template RealSchur& RealSchur::compute(const EigenBase& matrix, bool computeU) { + const Scalar considerAsZero = (std::numeric_limits::min)(); + eigen_assert(matrix.cols() == matrix.rows()); Index maxIters = m_maxIters; if (maxIters == -1) maxIters = m_maxIterationsPerRow * matrix.rows(); Scalar scale = matrix.derived().cwiseAbs().maxCoeff(); + if(scale& RealSchur::computeFromHessenberg(const HessMa Index totalIter = 0; // iteration count for whole matrix Scalar exshift(0); // sum of exceptional
shifts Scalar norm = computeNormOfT(); + // sub-diagonal entries smaller than considerAsZero will be treated as zero. + // We use eps^2 to enable more precision in small eigenvalues. + Scalar considerAsZero = numext::maxi( norm * numext::abs2(NumTraits::epsilon()), + (std::numeric_limits::min)() ); - if(norm!=0) + if(norm!=Scalar(0)) { while (iu >= 0) { - Index il = findSmallSubdiagEntry(iu); + Index il = findSmallSubdiagEntry(iu,considerAsZero); // Check for convergence if (il == iu) // One root found @@ -315,7 +331,7 @@ RealSchur& RealSchur::computeFromHessenberg(const HessMa else // No convergence yet { // The firstHouseholderVector vector has to be initialized to something to get rid of a silly GCC warning (-O1 -Wall -DNDEBUG ) - Vector3s firstHouseholderVector(0,0,0), shiftInfo; + Vector3s firstHouseholderVector = Vector3s::Zero(), shiftInfo; computeShift(iu, iter, exshift, shiftInfo); iter = iter + 1; totalIter = totalIter + 1; @@ -352,14 +368,17 @@ inline typename MatrixType::Scalar RealSchur::computeNormOfT() /** \internal Look for single small sub-diagonal element and returns its index */ template -inline Index RealSchur::findSmallSubdiagEntry(Index iu) +inline Index RealSchur::findSmallSubdiagEntry(Index iu, const Scalar& considerAsZero) { using std::abs; Index res = iu; while (res > 0) { Scalar s = abs(m_matT.coeff(res-1,res-1)) + abs(m_matT.coeff(res,res)); - if (abs(m_matT.coeff(res,res-1)) <= NumTraits::epsilon() * s) + + s = numext::maxi(s * NumTraits::epsilon(), considerAsZero); + + if (abs(m_matT.coeff(res,res-1)) <= s) break; res--; } diff --git a/eigenlib/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h b/eigenlib/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h index a9f56c4f..d37656fa 100644 --- a/eigenlib/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h +++ b/eigenlib/Eigen/src/Eigenvalues/SelfAdjointEigenSolver.h @@ -414,7 +414,8 @@ SelfAdjointEigenSolver& SelfAdjointEigenSolver if(n==1) { - m_eivalues.coeffRef(0,0) = numext::real(matrix.diagonal()[0]); + m_eivec = matrix; + m_eivalues.coeffRef(0,0) = numext::real(m_eivec.coeff(0,0)); if(computeEigenvectors) m_eivec.setOnes(n,n); m_info = Success; @@ -604,7 +605,8 @@ template struct direct_selfadjoint_eigenvalues res, Ref representative) { - using std::abs; + EIGEN_USING_STD_MATH(sqrt) + EIGEN_USING_STD_MATH(abs) Index i0; // Find non-zero column i0 (by construction, there must exist a non zero coefficient on the diagonal): mat.diagonal().cwiseAbs().maxCoeff(&i0); @@ -615,8 +617,8 @@ template struct direct_selfadjoint_eigenvaluesn1) res = c0/std::sqrt(n0); - else res = c1/std::sqrt(n1); + if(n0>n1) res = c0/sqrt(n0); + else res = c1/sqrt(n1); return true; } diff --git a/eigenlib/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h b/eigenlib/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h index 3891cf88..b0c947dc 100644 --- a/eigenlib/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h +++ b/eigenlib/Eigen/src/Eigenvalues/SelfAdjointEigenSolver_LAPACKE.h @@ -37,7 +37,7 @@ namespace Eigen { /** \internal Specialization for the data types supported by LAPACKe */ -#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW, LAPACKE_COLROW ) \ +#define EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, EIGCOLROW ) \ template<> template inline \ SelfAdjointEigenSolver >& \ SelfAdjointEigenSolver >::compute(const EigenBase& matrix, int options) \ @@ -47,7 +47,7 @@ SelfAdjointEigenSolver >::compute(c && (options&EigVecMask)!=EigVecMask \ && "invalid 
option parameter"); \ bool computeEigenvectors = (options&ComputeEigenvectors)==ComputeEigenvectors; \ - lapack_int n = internal::convert_index(matrix.cols()), lda, matrix_order, info; \ + lapack_int n = internal::convert_index(matrix.cols()), lda, info; \ m_eivalues.resize(n,1); \ m_subdiag.resize(n-1); \ m_eivec = matrix; \ @@ -63,27 +63,24 @@ SelfAdjointEigenSolver >::compute(c } \ \ lda = internal::convert_index(m_eivec.outerStride()); \ - matrix_order=LAPACKE_COLROW; \ char jobz, uplo='L'/*, range='A'*/; \ jobz = computeEigenvectors ? 'V' : 'N'; \ \ - info = LAPACKE_##LAPACKE_NAME( matrix_order, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \ + info = LAPACKE_##LAPACKE_NAME( LAPACK_COL_MAJOR, jobz, uplo, n, (LAPACKE_TYPE*)m_eivec.data(), lda, (LAPACKE_RTYPE*)m_eivalues.data() ); \ m_info = (info==0) ? Success : NoConvergence; \ m_isInitialized = true; \ m_eigenvectorsOk = computeEigenvectors; \ return *this; \ } +#define EIGEN_LAPACKE_EIG_SELFADJ(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME ) \ + EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, ColMajor ) \ + EIGEN_LAPACKE_EIG_SELFADJ_2(EIGTYPE, LAPACKE_TYPE, LAPACKE_RTYPE, LAPACKE_NAME, RowMajor ) -EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, ColMajor, LAPACK_COL_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev, ColMajor, LAPACK_COL_MAJOR) - -EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev, RowMajor, LAPACK_ROW_MAJOR) -EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev, RowMajor, LAPACK_ROW_MAJOR) +EIGEN_LAPACKE_EIG_SELFADJ(double, double, double, dsyev) +EIGEN_LAPACKE_EIG_SELFADJ(float, float, float, ssyev) +EIGEN_LAPACKE_EIG_SELFADJ(dcomplex, lapack_complex_double, double, zheev) +EIGEN_LAPACKE_EIG_SELFADJ(scomplex, lapack_complex_float, float, cheev) } // end namespace Eigen diff --git a/eigenlib/Eigen/src/Geometry/AngleAxis.h b/eigenlib/Eigen/src/Geometry/AngleAxis.h index 0af3c1b0..83ee1be4 100644 --- a/eigenlib/Eigen/src/Geometry/AngleAxis.h +++ b/eigenlib/Eigen/src/Geometry/AngleAxis.h @@ -178,7 +178,7 @@ EIGEN_DEVICE_FUNC AngleAxis& AngleAxis::operator=(const Quaterni if (n != Scalar(0)) { m_angle = Scalar(2)*atan2(n, abs(q.w())); - if(q.w() < 0) + if(q.w() < Scalar(0)) n = -n; m_axis = q.vec() / n; } diff --git a/eigenlib/Eigen/src/Geometry/Quaternion.h b/eigenlib/Eigen/src/Geometry/Quaternion.h index f6ef1bcf..b8182065 100644 --- a/eigenlib/Eigen/src/Geometry/Quaternion.h +++ b/eigenlib/Eigen/src/Geometry/Quaternion.h @@ -43,6 +43,11 @@ class QuaternionBase : public RotationBase typedef typename internal::traits::Scalar Scalar; typedef typename NumTraits::Real RealScalar; typedef typename internal::traits::Coefficients Coefficients; + typedef typename Coefficients::CoeffReturnType CoeffReturnType; + typedef typename internal::conditional::Flags&LvalueBit), + Scalar&, CoeffReturnType>::type NonConstCoeffReturnType; + + enum { Flags = Eigen::internal::traits::Flags }; @@ -58,22 +63,22 @@ class QuaternionBase : public RotationBase /** \returns the \c x coefficient */ - EIGEN_DEVICE_FUNC inline Scalar x() const { 
return this->derived().coeffs().coeff(0); } + EIGEN_DEVICE_FUNC inline CoeffReturnType x() const { return this->derived().coeffs().coeff(0); } /** \returns the \c y coefficient */ - EIGEN_DEVICE_FUNC inline Scalar y() const { return this->derived().coeffs().coeff(1); } + EIGEN_DEVICE_FUNC inline CoeffReturnType y() const { return this->derived().coeffs().coeff(1); } /** \returns the \c z coefficient */ - EIGEN_DEVICE_FUNC inline Scalar z() const { return this->derived().coeffs().coeff(2); } + EIGEN_DEVICE_FUNC inline CoeffReturnType z() const { return this->derived().coeffs().coeff(2); } /** \returns the \c w coefficient */ - EIGEN_DEVICE_FUNC inline Scalar w() const { return this->derived().coeffs().coeff(3); } + EIGEN_DEVICE_FUNC inline CoeffReturnType w() const { return this->derived().coeffs().coeff(3); } - /** \returns a reference to the \c x coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& x() { return this->derived().coeffs().coeffRef(0); } - /** \returns a reference to the \c y coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& y() { return this->derived().coeffs().coeffRef(1); } - /** \returns a reference to the \c z coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& z() { return this->derived().coeffs().coeffRef(2); } - /** \returns a reference to the \c w coefficient */ - EIGEN_DEVICE_FUNC inline Scalar& w() { return this->derived().coeffs().coeffRef(3); } + /** \returns a reference to the \c x coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType x() { return this->derived().coeffs().x(); } + /** \returns a reference to the \c y coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType y() { return this->derived().coeffs().y(); } + /** \returns a reference to the \c z coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType z() { return this->derived().coeffs().z(); } + /** \returns a reference to the \c w coefficient (if Derived is a non-const lvalue) */ + EIGEN_DEVICE_FUNC inline NonConstCoeffReturnType w() { return this->derived().coeffs().w(); } /** \returns a read-only vector expression of the imaginary part (x,y,z) */ EIGEN_DEVICE_FUNC inline const VectorBlock vec() const { return coeffs().template head<3>(); } @@ -164,20 +169,38 @@ class QuaternionBase : public RotationBase /** return the result vector of \a v through the rotation*/ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Vector3 _transformVector(const Vector3& v) const; + #ifdef EIGEN_PARSED_BY_DOXYGEN /** \returns \c *this with scalar type casted to \a NewScalarType * * Note that if \a NewScalarType is equal to the current scalar type of \c *this * then this function smartly returns a const reference to \c *this. 
*/ template - EIGEN_DEVICE_FUNC inline typename internal::cast_return_type >::type cast() const + EIGEN_DEVICE_FUNC inline typename internal::cast_return_type >::type cast() const; + + #else + + template + EIGEN_DEVICE_FUNC inline + typename internal::enable_if::value,const Derived&>::type cast() const { - return typename internal::cast_return_type >::type(derived()); + return derived(); } + template + EIGEN_DEVICE_FUNC inline + typename internal::enable_if::value,Quaternion >::type cast() const + { + return Quaternion(coeffs().template cast()); + } + #endif + #ifdef EIGEN_QUATERNIONBASE_PLUGIN # include EIGEN_QUATERNIONBASE_PLUGIN #endif +protected: + EIGEN_DEFAULT_COPY_CONSTRUCTOR(QuaternionBase) + EIGEN_DEFAULT_EMPTY_CONSTRUCTOR_AND_DESTRUCTOR(QuaternionBase) }; /*************************************************************************** @@ -423,7 +446,7 @@ typedef Map, Aligned> QuaternionMapAlignedd; // Generic Quaternion * Quaternion product // This product can be specialized for a given architecture via the Arch template argument. namespace internal { -template struct quat_product +template struct quat_product { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion run(const QuaternionBase& a, const QuaternionBase& b){ return Quaternion @@ -446,8 +469,7 @@ QuaternionBase::operator* (const QuaternionBase& other) c EIGEN_STATIC_ASSERT((internal::is_same::value), YOU_MIXED_DIFFERENT_NUMERIC_TYPES__YOU_NEED_TO_USE_THE_CAST_METHOD_OF_MATRIXBASE_TO_CAST_NUMERIC_TYPES_EXPLICITLY) return internal::quat_product::Scalar, - EIGEN_PLAIN_ENUM_MIN(internal::traits::Alignment, internal::traits::Alignment)>::run(*this, other); + typename internal::traits::Scalar>::run(*this, other); } /** \sa operator*(Quaternion) */ @@ -672,7 +694,7 @@ EIGEN_DEVICE_FUNC inline Quaternion::Scalar> // Generic conjugate of a Quaternion namespace internal { -template struct quat_conj +template struct quat_conj { EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE Quaternion run(const QuaternionBase& q){ return Quaternion(q.w(),-q.x(),-q.y(),-q.z()); @@ -691,8 +713,7 @@ EIGEN_DEVICE_FUNC inline Quaternion::Scalar> QuaternionBase::conjugate() const { return internal::quat_conj::Scalar, - internal::traits::Alignment>::run(*this); + typename internal::traits::Scalar>::run(*this); } diff --git a/eigenlib/Eigen/src/Geometry/Scaling.h b/eigenlib/Eigen/src/Geometry/Scaling.h old mode 100755 new mode 100644 index f58ca03d..33eabd81 --- a/eigenlib/Eigen/src/Geometry/Scaling.h +++ b/eigenlib/Eigen/src/Geometry/Scaling.h @@ -14,7 +14,7 @@ namespace Eigen { /** \geometry_module \ingroup Geometry_Module * - * \class Scaling + * \class UniformScaling * * \brief Represents a generic uniform scaling transformation * diff --git a/eigenlib/Eigen/src/Geometry/Transform.h b/eigenlib/Eigen/src/Geometry/Transform.h index 3f31ee45..c21d9e55 100644 --- a/eigenlib/Eigen/src/Geometry/Transform.h +++ b/eigenlib/Eigen/src/Geometry/Transform.h @@ -252,11 +252,11 @@ protected: public: /** Default constructor without initialization of the meaningful coefficients. - * If Mode==Affine, then the last row is set to [0 ... 0 1] */ + * If Mode==Affine or Mode==Isometry, then the last row is set to [0 ... 0 1] */ EIGEN_DEVICE_FUNC inline Transform() { check_template_params(); - internal::transform_make_affine<(int(Mode)==Affine) ? Affine : AffineCompact>::run(m_matrix); + internal::transform_make_affine<(int(Mode)==Affine || int(Mode)==Isometry) ? 
Affine : AffineCompact>::run(m_matrix); } EIGEN_DEVICE_FUNC inline Transform(const Transform& other) diff --git a/eigenlib/Eigen/src/Geometry/Translation.h b/eigenlib/Eigen/src/Geometry/Translation.h index 51d9a82e..0e99ce68 100644 --- a/eigenlib/Eigen/src/Geometry/Translation.h +++ b/eigenlib/Eigen/src/Geometry/Translation.h @@ -138,12 +138,6 @@ public: /** \returns the inverse translation (opposite) */ Translation inverse() const { return Translation(-m_coeffs); } - Translation& operator=(const Translation& other) - { - m_coeffs = other.m_coeffs; - return *this; - } - static const Translation Identity() { return Translation(VectorType::Zero()); } /** \returns \c *this with scalar type casted to \a NewScalarType diff --git a/eigenlib/Eigen/src/Geometry/Umeyama.h b/eigenlib/Eigen/src/Geometry/Umeyama.h index 7e933fca..6b755008 100644 --- a/eigenlib/Eigen/src/Geometry/Umeyama.h +++ b/eigenlib/Eigen/src/Geometry/Umeyama.h @@ -87,7 +87,7 @@ struct umeyama_transform_matrix_type * \f{align*} * T = \begin{bmatrix} c\mathbf{R} & \mathbf{t} \\ \mathbf{0} & 1 \end{bmatrix} * \f} -* minimizing the resudiual above. This transformation is always returned as an +* minimizing the residual above. This transformation is always returned as an * Eigen::Matrix. */ template diff --git a/eigenlib/Eigen/src/Geometry/arch/Geometry_SSE.h b/eigenlib/Eigen/src/Geometry/arch/Geometry_SSE.h index 1a86ff83..f68cab58 100644 --- a/eigenlib/Eigen/src/Geometry/arch/Geometry_SSE.h +++ b/eigenlib/Eigen/src/Geometry/arch/Geometry_SSE.h @@ -16,17 +16,23 @@ namespace Eigen { namespace internal { template -struct quat_product +struct quat_product { + enum { + AAlignment = traits::Alignment, + BAlignment = traits::Alignment, + ResAlignment = traits >::Alignment + }; static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) { Quaternion res; const __m128 mask = _mm_setr_ps(0.f,0.f,0.f,-0.f); - __m128 a = _a.coeffs().template packet(0); - __m128 b = _b.coeffs().template packet(0); + __m128 a = _a.coeffs().template packet(0); + __m128 b = _b.coeffs().template packet(0); __m128 s1 = _mm_mul_ps(vec4f_swizzle1(a,1,2,0,2),vec4f_swizzle1(b,2,0,1,2)); __m128 s2 = _mm_mul_ps(vec4f_swizzle1(a,3,3,3,1),vec4f_swizzle1(b,0,1,2,1)); - pstore(&res.x(), + pstoret( + &res.x(), _mm_add_ps(_mm_sub_ps(_mm_mul_ps(a,vec4f_swizzle1(b,3,3,3,3)), _mm_mul_ps(vec4f_swizzle1(a,2,0,1,0), vec4f_swizzle1(b,1,2,0,0))), @@ -36,14 +42,17 @@ struct quat_product } }; -template -struct quat_conj +template +struct quat_conj { + enum { + ResAlignment = traits >::Alignment + }; static inline Quaternion run(const QuaternionBase& q) { Quaternion res; const __m128 mask = _mm_setr_ps(-0.f,-0.f,-0.f,0.f); - pstore(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet(0))); + pstoret(&res.x(), _mm_xor_ps(mask, q.coeffs().template packet::Alignment>(0))); return res; } }; @@ -52,6 +61,9 @@ struct quat_conj template struct cross3_impl { + enum { + ResAlignment = traits::type>::Alignment + }; static inline typename plain_matrix_type::type run(const VectorLhs& lhs, const VectorRhs& rhs) { @@ -60,7 +72,7 @@ struct cross3_impl __m128 mul1=_mm_mul_ps(vec4f_swizzle1(a,1,2,0,3),vec4f_swizzle1(b,2,0,1,3)); __m128 mul2=_mm_mul_ps(vec4f_swizzle1(a,2,0,1,3),vec4f_swizzle1(b,1,2,0,3)); typename plain_matrix_type::type res; - pstore(&res.x(),_mm_sub_ps(mul1,mul2)); + pstoret(&res.x(),_mm_sub_ps(mul1,mul2)); return res; } }; @@ -68,9 +80,14 @@ struct cross3_impl -template -struct quat_product +template +struct quat_product { + enum { + BAlignment = traits::Alignment, 
+ ResAlignment = traits >::Alignment + }; + static inline Quaternion run(const QuaternionBase& _a, const QuaternionBase& _b) { const Packet2d mask = _mm_castsi128_pd(_mm_set_epi32(0x0,0x0,0x80000000,0x0)); @@ -78,8 +95,8 @@ struct quat_product Quaternion res; const double* a = _a.coeffs().data(); - Packet2d b_xy = _b.coeffs().template packet(0); - Packet2d b_zw = _b.coeffs().template packet(2); + Packet2d b_xy = _b.coeffs().template packet(0); + Packet2d b_zw = _b.coeffs().template packet(2); Packet2d a_xx = pset1(a[0]); Packet2d a_yy = pset1(a[1]); Packet2d a_zz = pset1(a[2]); @@ -97,9 +114,9 @@ struct quat_product t2 = psub(pmul(a_zz, b_xy), pmul(a_xx, b_zw)); #ifdef EIGEN_VECTORIZE_SSE3 EIGEN_UNUSED_VARIABLE(mask) - pstore(&res.x(), _mm_addsub_pd(t1, preverse(t2))); + pstoret(&res.x(), _mm_addsub_pd(t1, preverse(t2))); #else - pstore(&res.x(), padd(t1, pxor(mask,preverse(t2)))); + pstoret(&res.x(), padd(t1, pxor(mask,preverse(t2)))); #endif /* @@ -111,25 +128,28 @@ struct quat_product t2 = padd(pmul(a_zz, b_zw), pmul(a_xx, b_xy)); #ifdef EIGEN_VECTORIZE_SSE3 EIGEN_UNUSED_VARIABLE(mask) - pstore(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2))); + pstoret(&res.z(), preverse(_mm_addsub_pd(preverse(t1), t2))); #else - pstore(&res.z(), psub(t1, pxor(mask,preverse(t2)))); + pstoret(&res.z(), psub(t1, pxor(mask,preverse(t2)))); #endif return res; } }; -template -struct quat_conj +template +struct quat_conj { + enum { + ResAlignment = traits >::Alignment + }; static inline Quaternion run(const QuaternionBase& q) { Quaternion res; const __m128d mask0 = _mm_setr_pd(-0.,-0.); const __m128d mask2 = _mm_setr_pd(-0.,0.); - pstore(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet(0))); - pstore(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet(2))); + pstoret(&res.x(), _mm_xor_pd(mask0, q.coeffs().template packet::Alignment>(0))); + pstoret(&res.z(), _mm_xor_pd(mask2, q.coeffs().template packet::Alignment>(2))); return res; } }; diff --git a/eigenlib/Eigen/src/Householder/BlockHouseholder.h b/eigenlib/Eigen/src/Householder/BlockHouseholder.h index 39bf8c83..01a7ed18 100644 --- a/eigenlib/Eigen/src/Householder/BlockHouseholder.h +++ b/eigenlib/Eigen/src/Householder/BlockHouseholder.h @@ -87,7 +87,8 @@ void apply_block_householder_on_the_left(MatrixType& mat, const VectorsType& vec const TriangularView V(vectors); // A -= V T V^* A - Matrix tmp = V.adjoint() * mat; // FIXME add .noalias() once the triangular product can work inplace if(forward) tmp = T.template triangularView() * tmp; diff --git a/eigenlib/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h b/eigenlib/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h index 358444af..f66c846e 100644 --- a/eigenlib/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h +++ b/eigenlib/Eigen/src/IterativeLinearSolvers/BasicPreconditioners.h @@ -152,13 +152,28 @@ class LeastSquareDiagonalPreconditioner : public DiagonalPreconditioner<_Scalar> { // Compute the inverse squared-norm of each column of mat m_invdiag.resize(mat.cols()); - for(Index j=0; j0) - m_invdiag(j) = RealScalar(1)/sum; - else - m_invdiag(j) = RealScalar(1); + m_invdiag.setZero(); + for(Index j=0; jRealScalar(0)) + m_invdiag(j) = RealScalar(1)/numext::real(m_invdiag(j)); + } + else + { + for(Index j=0; jRealScalar(0)) + m_invdiag(j) = RealScalar(1)/sum; + else + m_invdiag(j) = RealScalar(1); + } } Base::m_isInitialized = true; return *this; diff --git a/eigenlib/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h 
b/eigenlib/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h index 395daa8e..f7ce4713 100644 --- a/eigenlib/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h +++ b/eigenlib/Eigen/src/IterativeLinearSolvers/ConjugateGradient.h @@ -50,7 +50,8 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, tol_error = 0; return; } - RealScalar threshold = tol*tol*rhsNorm2; + const RealScalar considerAsZero = (std::numeric_limits::min)(); + RealScalar threshold = numext::maxi(tol*tol*rhsNorm2,considerAsZero); RealScalar residualNorm2 = residual.squaredNorm(); if (residualNorm2 < threshold) { @@ -58,7 +59,7 @@ void conjugate_gradient(const MatrixType& mat, const Rhs& rhs, Dest& x, tol_error = sqrt(residualNorm2 / rhsNorm2); return; } - + VectorType p(n); p = precond.solve(residual); // initial search direction diff --git a/eigenlib/Eigen/src/Jacobi/Jacobi.h b/eigenlib/Eigen/src/Jacobi/Jacobi.h index d25af8e9..1998c632 100644 --- a/eigenlib/Eigen/src/Jacobi/Jacobi.h +++ b/eigenlib/Eigen/src/Jacobi/Jacobi.h @@ -65,11 +65,11 @@ template class JacobiRotation bool makeJacobi(const MatrixBase&, Index p, Index q); bool makeJacobi(const RealScalar& x, const Scalar& y, const RealScalar& z); - void makeGivens(const Scalar& p, const Scalar& q, Scalar* z=0); + void makeGivens(const Scalar& p, const Scalar& q, Scalar* r=0); protected: - void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::true_type); - void makeGivens(const Scalar& p, const Scalar& q, Scalar* z, internal::false_type); + void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::true_type); + void makeGivens(const Scalar& p, const Scalar& q, Scalar* r, internal::false_type); Scalar m_c, m_s; }; @@ -84,7 +84,6 @@ bool JacobiRotation::makeJacobi(const RealScalar& x, const Scalar& y, co { using std::sqrt; using std::abs; - typedef typename NumTraits::Real RealScalar; RealScalar deno = RealScalar(2)*abs(y); if(deno < (std::numeric_limits::min)()) { @@ -133,7 +132,7 @@ inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Ind * \f$ V = \left ( \begin{array}{c} p \\ q \end{array} \right )\f$ yields: * \f$ G^* V = \left ( \begin{array}{c} r \\ 0 \end{array} \right )\f$. * - * The value of \a z is returned if \a z is not null (the default is null). + * The value of \a r is returned if \a r is not null (the default is null). * Also note that G is built such that the cosine is always real. 
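* For instance, with real scalars (a small numeric sketch of the convention above):
* \code
* JacobiRotation<double> G;
* double r;
* G.makeGivens(3.0, 4.0, &r);   // yields c = 0.6, s = -0.8, r = 5
* // i.e. G^* maps the vector (3, 4) to (5, 0)
* \endcode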
* * Example: \include Jacobi_makeGivens.cpp @@ -146,9 +145,9 @@ inline bool JacobiRotation::makeJacobi(const MatrixBase& m, Ind * \sa MatrixBase::applyOnTheLeft(), MatrixBase::applyOnTheRight() */ template -void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* z) +void JacobiRotation::makeGivens(const Scalar& p, const Scalar& q, Scalar* r) { - makeGivens(p, q, z, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); + makeGivens(p, q, r, typename internal::conditional::IsComplex, internal::true_type, internal::false_type>::type()); } @@ -298,12 +297,144 @@ inline void MatrixBase::applyOnTheRight(Index p, Index q, const JacobiR } namespace internal { + +template +struct apply_rotation_in_the_plane_selector +{ + static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + { + for(Index i=0; i +struct apply_rotation_in_the_plane_selector +{ + static inline void run(Scalar *x, Index incrx, Scalar *y, Index incry, Index size, OtherScalar c, OtherScalar s) + { + enum { + PacketSize = packet_traits::size, + OtherPacketSize = packet_traits::size + }; + typedef typename packet_traits::type Packet; + typedef typename packet_traits::type OtherPacket; + + /*** dynamic-size vectorized paths ***/ + if(SizeAtCompileTime == Dynamic && ((incrx==1 && incry==1) || PacketSize == 1)) + { + // both vectors are sequentially stored in memory => vectorization + enum { Peeling = 2 }; + + Index alignedStart = internal::first_default_aligned(y, size); + Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; + + const OtherPacket pc = pset1(c); + const OtherPacket ps = pset1(s); + conj_helper::IsComplex,false> pcj; + conj_helper pm; + + for(Index i=0; i(px); + Packet yi = pload(py); + pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + px += PacketSize; + py += PacketSize; + } + } + else + { + Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize); + for(Index i=alignedStart; i(px); + Packet xi1 = ploadu(px+PacketSize); + Packet yi = pload (py); + Packet yi1 = pload (py+PacketSize); + pstoreu(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstoreu(px+PacketSize, padd(pm.pmul(pc,xi1),pcj.pmul(ps,yi1))); + pstore (py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pm.pmul(ps,xi1))); + px += Peeling*PacketSize; + py += Peeling*PacketSize; + } + if(alignedEnd!=peelingEnd) + { + Packet xi = ploadu(x+peelingEnd); + Packet yi = pload (y+peelingEnd); + pstoreu(x+peelingEnd, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + } + } + + for(Index i=alignedEnd; i0) // FIXME should be compared to the required alignment + { + const OtherPacket pc = pset1(c); + const OtherPacket ps = pset1(s); + conj_helper::IsComplex,false> pcj; + conj_helper pm; + Scalar* EIGEN_RESTRICT px = x; + Scalar* EIGEN_RESTRICT py = y; + for(Index i=0; i(px); + Packet yi = pload(py); + pstore(px, padd(pm.pmul(pc,xi),pcj.pmul(ps,yi))); + pstore(py, psub(pcj.pmul(pc,yi),pm.pmul(ps,xi))); + px += PacketSize; + py += PacketSize; + } + } + + /*** non-vectorized path ***/ + else + { + apply_rotation_in_the_plane_selector::run(x,incrx,y,incry,size,c,s); + } + } +}; + template void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x, DenseBase& xpr_y, const JacobiRotation& j) { typedef typename VectorX::Scalar Scalar; - enum { 
PacketSize = packet_traits::size }; - typedef typename packet_traits::type Packet; + const bool Vectorizable = (VectorX::Flags & VectorY::Flags & PacketAccessBit) + && (int(packet_traits::size) == int(packet_traits::size)); + eigen_assert(xpr_x.size() == xpr_y.size()); Index size = xpr_x.size(); Index incrx = xpr_x.derived().innerStride(); @@ -317,113 +448,11 @@ void /*EIGEN_DONT_INLINE*/ apply_rotation_in_the_plane(DenseBase& xpr_x if (c==OtherScalar(1) && s==OtherScalar(0)) return; - /*** dynamic-size vectorized paths ***/ - - if(VectorX::SizeAtCompileTime == Dynamic && - (VectorX::Flags & VectorY::Flags & PacketAccessBit) && - ((incrx==1 && incry==1) || PacketSize == 1)) - { - // both vectors are sequentially stored in memory => vectorization - enum { Peeling = 2 }; - - Index alignedStart = internal::first_default_aligned(y, size); - Index alignedEnd = alignedStart + ((size-alignedStart)/PacketSize)*PacketSize; - - const Packet pc = pset1(c); - const Packet ps = pset1(s); - conj_helper::IsComplex,false> pcj; - - for(Index i=0; i(px); - Packet yi = pload(py); - pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); - px += PacketSize; - py += PacketSize; - } - } - else - { - Index peelingEnd = alignedStart + ((size-alignedStart)/(Peeling*PacketSize))*(Peeling*PacketSize); - for(Index i=alignedStart; i(px); - Packet xi1 = ploadu(px+PacketSize); - Packet yi = pload (py); - Packet yi1 = pload (py+PacketSize); - pstoreu(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstoreu(px+PacketSize, padd(pmul(pc,xi1),pcj.pmul(ps,yi1))); - pstore (py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); - pstore (py+PacketSize, psub(pcj.pmul(pc,yi1),pmul(ps,xi1))); - px += Peeling*PacketSize; - py += Peeling*PacketSize; - } - if(alignedEnd!=peelingEnd) - { - Packet xi = ploadu(x+peelingEnd); - Packet yi = pload (y+peelingEnd); - pstoreu(x+peelingEnd, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore (y+peelingEnd, psub(pcj.pmul(pc,yi),pmul(ps,xi))); - } - } - - for(Index i=alignedEnd; i::Alignment, evaluator::Alignment)>0)) // FIXME should be compared to the required alignment - { - const Packet pc = pset1(c); - const Packet ps = pset1(s); - conj_helper::IsComplex,false> pcj; - Scalar* EIGEN_RESTRICT px = x; - Scalar* EIGEN_RESTRICT py = y; - for(Index i=0; i(px); - Packet yi = pload(py); - pstore(px, padd(pmul(pc,xi),pcj.pmul(ps,yi))); - pstore(py, psub(pcj.pmul(pc,yi),pmul(ps,xi))); - px += PacketSize; - py += PacketSize; - } - } - - /*** non-vectorized path ***/ - else - { - for(Index i=0; i::Alignment, evaluator::Alignment), + Vectorizable>::run(x,incrx,y,incry,size,c,s); } } // end namespace internal diff --git a/eigenlib/Eigen/src/LU/InverseImpl.h b/eigenlib/Eigen/src/LU/InverseImpl.h index 018f99b5..f49f2336 100644 --- a/eigenlib/Eigen/src/LU/InverseImpl.h +++ b/eigenlib/Eigen/src/LU/InverseImpl.h @@ -404,7 +404,7 @@ inline void MatrixBase::computeInverseWithCheck( const RealScalar& absDeterminantThreshold ) const { - RealScalar determinant; + Scalar determinant; // i'd love to put some static assertions there, but SFINAE means that they have no effect... 
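For reference, a typical call site of computeInverseWithCheck looks as follows (a minimal sketch; names are illustrative):

\code
Eigen::Matrix3d A = Eigen::Matrix3d::Random();
Eigen::Matrix3d Ainv;
bool invertible;
A.computeInverseWithCheck(Ainv, invertible);  // invertible==false if |det(A)| is below the threshold
if (invertible) { /* safe to use Ainv */ }
\endcode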
eigen_assert(rows() == cols()); computeInverseAndDetWithCheck(inverse,determinant,invertible,absDeterminantThreshold); diff --git a/eigenlib/Eigen/src/LU/PartialPivLU.h b/eigenlib/Eigen/src/LU/PartialPivLU.h index d4396188..6b10f39f 100644 --- a/eigenlib/Eigen/src/LU/PartialPivLU.h +++ b/eigenlib/Eigen/src/LU/PartialPivLU.h @@ -519,7 +519,10 @@ void PartialPivLU::compute() // the row permutation is stored as int indices, so just to be sure: eigen_assert(m_lu.rows()::highest()); - m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); + if(m_lu.cols()>0) + m_l1_norm = m_lu.cwiseAbs().colwise().sum().maxCoeff(); + else + m_l1_norm = RealScalar(0); eigen_assert(m_lu.rows() == m_lu.cols() && "PartialPivLU is only for square (and moreover invertible) matrices"); const Index size = m_lu.rows(); diff --git a/eigenlib/Eigen/src/LU/arch/Inverse_SSE.h b/eigenlib/Eigen/src/LU/arch/Inverse_SSE.h index ebb64a62..4dce2ef2 100644 --- a/eigenlib/Eigen/src/LU/arch/Inverse_SSE.h +++ b/eigenlib/Eigen/src/LU/arch/Inverse_SSE.h @@ -44,7 +44,7 @@ struct compute_inverse_size4 static void run(const MatrixType& mat, ResultType& result) { ActualMatrixType matrix(mat); - EIGEN_ALIGN16 const unsigned int _Sign_PNNP[4] = { 0x00000000, 0x80000000, 0x80000000, 0x00000000 }; + const Packet4f p4f_sign_PNNP = _mm_castsi128_ps(_mm_set_epi32(0x00000000, 0x80000000, 0x80000000, 0x00000000)); // Load the full matrix into registers __m128 _L1 = matrix.template packet( 0); @@ -139,7 +139,7 @@ struct compute_inverse_size4 iC = _mm_sub_ps(iC, _mm_mul_ps(_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66))); rd = _mm_shuffle_ps(rd,rd,0); - rd = _mm_xor_ps(rd, _mm_load_ps((float*)_Sign_PNNP)); + rd = _mm_xor_ps(rd, p4f_sign_PNNP); // iB = C*|B| - D*B#*A iB = _mm_sub_ps(_mm_mul_ps(C,_mm_shuffle_ps(dB,dB,0)), iB); diff --git a/eigenlib/Eigen/src/OrderingMethods/Eigen_Colamd.h b/eigenlib/Eigen/src/OrderingMethods/Eigen_Colamd.h index 933cd564..da85b4d6 100644 --- a/eigenlib/Eigen/src/OrderingMethods/Eigen_Colamd.h +++ b/eigenlib/Eigen/src/OrderingMethods/Eigen_Colamd.h @@ -1004,7 +1004,7 @@ static IndexType find_ordering /* return the number of garbage collections */ COLAMD_ASSERT (head [min_score] >= COLAMD_EMPTY) ; /* get pivot column from head of minimum degree list */ - while (head [min_score] == COLAMD_EMPTY && min_score < n_col) + while (min_score < n_col && head [min_score] == COLAMD_EMPTY) { min_score++ ; } diff --git a/eigenlib/Eigen/src/PaStiXSupport/PaStiXSupport.h b/eigenlib/Eigen/src/PaStiXSupport/PaStiXSupport.h index d2ebfd7b..160d8a52 100644 --- a/eigenlib/Eigen/src/PaStiXSupport/PaStiXSupport.h +++ b/eigenlib/Eigen/src/PaStiXSupport/PaStiXSupport.h @@ -64,28 +64,28 @@ namespace internal typedef typename _MatrixType::StorageIndex StorageIndex; }; - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, float *vals, int *perm, int * invp, float *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} s_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); } - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int 
pastix_comm, int n, int *ptr, int *idx, double *vals, int *perm, int * invp, double *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} d_pastix(pastix_data, pastix_comm, n, ptr, idx, vals, perm, invp, x, nbrhs, iparm, dparm); } - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} c_pastix(pastix_data, pastix_comm, n, ptr, idx, reinterpret_cast(vals), perm, invp, reinterpret_cast(x), nbrhs, iparm, dparm); } - void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) + inline void eigen_pastix(pastix_data_t **pastix_data, int pastix_comm, int n, int *ptr, int *idx, std::complex *vals, int *perm, int * invp, std::complex *x, int nbrhs, int *iparm, double *dparm) { if (n == 0) { ptr = NULL; idx = NULL; vals = NULL; } if (nbrhs == 0) {x = NULL; nbrhs=1;} diff --git a/eigenlib/Eigen/src/PardisoSupport/PardisoSupport.h b/eigenlib/Eigen/src/PardisoSupport/PardisoSupport.h index 091c3970..98d0e3f2 100644 --- a/eigenlib/Eigen/src/PardisoSupport/PardisoSupport.h +++ b/eigenlib/Eigen/src/PardisoSupport/PardisoSupport.h @@ -192,7 +192,8 @@ class PardisoImpl : public SparseSolverBase void pardisoInit(int type) { m_type = type; - bool symmetric = std::abs(m_type) < 10; + EIGEN_USING_STD_MATH(abs); + bool symmetric = abs(m_type) < 10; m_iparm[0] = 1; // No solver default m_iparm[1] = 2; // use Metis for the ordering m_iparm[2] = 0; // Reserved. Set to zero. (??Numbers of processors, value of OMP_NUM_THREADS??) diff --git a/eigenlib/Eigen/src/QR/ColPivHouseholderQR.h b/eigenlib/Eigen/src/QR/ColPivHouseholderQR.h index 0e47c833..a7b47d55 100644 --- a/eigenlib/Eigen/src/QR/ColPivHouseholderQR.h +++ b/eigenlib/Eigen/src/QR/ColPivHouseholderQR.h @@ -506,8 +506,8 @@ void ColPivHouseholderQR::computeInPlace() m_colNormsUpdated.coeffRef(k) = m_colNormsDirect.coeffRef(k); } - RealScalar threshold_helper = numext::abs2(m_colNormsUpdated.maxCoeff() * NumTraits::epsilon()) / RealScalar(rows); - RealScalar norm_downdate_threshold = numext::sqrt(NumTraits::epsilon()); + RealScalar threshold_helper = numext::abs2(m_colNormsUpdated.maxCoeff() * NumTraits::epsilon()) / RealScalar(rows); + RealScalar norm_downdate_threshold = numext::sqrt(NumTraits::epsilon()); m_nonzero_pivots = size; // the generic case is that in which all pivots are nonzero (invertible case) m_maxpivot = RealScalar(0); @@ -553,12 +553,12 @@ void ColPivHouseholderQR::computeInPlace() // http://www.netlib.org/lapack/lawnspdf/lawn176.pdf // and used in LAPACK routines xGEQPF and xGEQP3. // See lines 278-297 in http://www.netlib.org/lapack/explore-html/dc/df4/sgeqpf_8f_source.html - if (m_colNormsUpdated.coeffRef(j) != 0) { + if (m_colNormsUpdated.coeffRef(j) != RealScalar(0)) { RealScalar temp = abs(m_qr.coeffRef(k, j)) / m_colNormsUpdated.coeffRef(j); temp = (RealScalar(1) + temp) * (RealScalar(1) - temp); - temp = temp < 0 ? 
0 : temp; - RealScalar temp2 = temp * numext::abs2(m_colNormsUpdated.coeffRef(j) / - m_colNormsDirect.coeffRef(j)); + temp = temp < RealScalar(0) ? RealScalar(0) : temp; + RealScalar temp2 = temp * numext::abs2(m_colNormsUpdated.coeffRef(j) / + m_colNormsDirect.coeffRef(j)); if (temp2 <= norm_downdate_threshold) { // The updated norm has become too inaccurate so re-compute the column // norm directly. diff --git a/eigenlib/Eigen/src/SVD/BDCSVD.h b/eigenlib/Eigen/src/SVD/BDCSVD.h index 25fca6f4..a5b73f8f 100644 --- a/eigenlib/Eigen/src/SVD/BDCSVD.h +++ b/eigenlib/Eigen/src/SVD/BDCSVD.h @@ -11,7 +11,7 @@ // Copyright (C) 2013 Jean Ceccato // Copyright (C) 2013 Pierre Zoppitelli // Copyright (C) 2013 Jitse Niesen -// Copyright (C) 2014-2016 Gael Guennebaud +// Copyright (C) 2014-2017 Gael Guennebaud // // Source Code Form is subject to the terms of the Mozilla // Public License v. 2.0. If a copy of the MPL was not distributed @@ -77,6 +77,7 @@ public: typedef _MatrixType MatrixType; typedef typename MatrixType::Scalar Scalar; typedef typename NumTraits::Real RealScalar; + typedef typename NumTraits::Literal Literal; enum { RowsAtCompileTime = MatrixType::RowsAtCompileTime, ColsAtCompileTime = MatrixType::ColsAtCompileTime, @@ -259,7 +260,7 @@ BDCSVD& BDCSVD::compute(const MatrixType& matrix, unsign //**** step 0 - Copy the input matrix and apply scaling to reduce over/under-flows RealScalar scale = matrix.cwiseAbs().maxCoeff(); - if(scale==RealScalar(0)) scale = RealScalar(1); + if(scale==Literal(0)) scale = Literal(1); MatrixX copy; if (m_isTranspose) copy = matrix.adjoint()/scale; else copy = matrix/scale; @@ -351,13 +352,13 @@ void BDCSVD::structured_update(Block A, co Index k1=0, k2=0; for(Index j=0; j::divide (Index firstCol, Index lastCol, Index firstRowW, l = m_naiveU.row(1).segment(firstCol, k); f = m_naiveU.row(0).segment(firstCol + k + 1, n - k - 1); } - if (m_compV) m_naiveV(firstRowW+k, firstColW) = 1; + if (m_compV) m_naiveV(firstRowW+k, firstColW) = Literal(1); if (r0::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec ArrayRef col0 = m_computed.col(firstCol).segment(firstCol, n); m_workspace.head(n) = m_computed.block(firstCol, firstCol, n, n).diagonal(); ArrayRef diag = m_workspace.head(n); - diag(0) = 0; + diag(0) = Literal(0); // Allocate space for singular values and vectors singVals.resize(n); @@ -590,7 +591,7 @@ void BDCSVD::computeSVDofM(Index firstCol, Index n, MatrixXr& U, Vec // but others are interleaved and we must ignore them at this stage. // To this end, let's compute a permutation skipping them: Index actual_n = n; - while(actual_n>1 && diag(actual_n-1)==0) --actual_n; + while(actual_n>1 && diag(actual_n-1)==Literal(0)) --actual_n; Index m = 0; // size of the deflated problem for(Index k=0;kconsiderZero) @@ -691,11 +692,13 @@ template typename BDCSVD::RealScalar BDCSVD::secularEq(RealScalar mu, const ArrayRef& col0, const ArrayRef& diag, const IndicesRef &perm, const ArrayRef& diagShifted, RealScalar shift) { Index m = perm.size(); - RealScalar res = 1; + RealScalar res = Literal(1); for(Index i=0; i::computeSingVals(const ArrayRef& col0, const ArrayRef& d { using std::abs; using std::swap; + using std::sqrt; Index n = col0.size(); Index actual_n = n; - while(actual_n>1 && col0(actual_n-1)==0) --actual_n; + // Note that here actual_n is computed based on col0(i)==0 instead of diag(i)==0 as above + // because 1) we have diag(i)==0 => col0(i)==0 and 2) if col0(i)==0, then diag(i) is already a singular value. 
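Before the trimming loop below, it is worth stating what secularEq above evaluates: in the code's notation, with shift \f$ s \f$ and \f$ z_i \f$ the entries of col0, it is the classical secular function of divide-and-conquer SVD, \f$ f(\mu) = 1 + \sum_{i \in \mathrm{perm}} \frac{z_i^2}{(d_i - s - \mu)(d_i + s + \mu)} \f$. Since \f$ (d_i - s - \mu)(d_i + s + \mu) = d_i^2 - \sigma^2 \f$ with \f$ \sigma = s + \mu \f$, this is the textbook equation \f$ 1 + \sum_i z_i^2/(d_i^2 - \sigma^2) = 0 \f$ whose roots are the singular values, and computeSingVals locates one root per interval.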
+ while(actual_n>1 && col0(actual_n-1)==Literal(0)) --actual_n; for (Index k = 0; k < n; ++k) { - if (col0(k) == 0 || actual_n==1) + if (col0(k) == Literal(0) || actual_n==1) { // if col0(k) == 0, then entry is deflated, so singular value is on diagonal // if actual_n==1, then the deflated problem is already diagonalized singVals(k) = k==0 ? col0(0) : diag(k); - mus(k) = 0; + mus(k) = Literal(0); shifts(k) = k==0 ? col0(0) : diag(k); continue; } @@ -731,15 +737,17 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d right = (diag(actual_n-1) + col0.matrix().norm()); else { - // Skip deflated singular values + // Skip deflated singular values, + // recall that at this stage we assume that z[j]!=0 and all entries for which z[j]==0 have been put aside. + // This should be equivalent to using perm[] Index l = k+1; - while(col0(l)==0) { ++l; eigen_internal_assert(l::computeSingVals(const ArrayRef& col0, const ArrayRef& d << " " << secularEq(0.8*(left+right), col0, diag, perm, diag, 0) << " " << secularEq(0.9*(left+right), col0, diag, perm, diag, 0) << "\n"; #endif - RealScalar shift = (k == actual_n-1 || fMid > 0) ? left : right; + RealScalar shift = (k == actual_n-1 || fMid > Literal(0)) ? left : right; // measure everything relative to shift Map diagShifted(m_workspace.data()+4*n, n); diagShifted = diag - shift; + + if(k!=actual_n-1) + { + // check that after the shift, f(mid) is still negative: + RealScalar midShifted = (right - left) / RealScalar(2); + if(shift==right) + midShifted = -midShifted; + RealScalar fMidShifted = secularEq(midShifted, col0, diag, perm, diagShifted, shift); + if(fMidShifted>0) + { + // fMid was erroneous, fix it: + shift = fMidShifted > Literal(0) ? left : right; + diagShifted = diag - shift; + } + } // initial guess RealScalar muPrev, muCur; @@ -785,13 +808,13 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d // rational interpolation: fit a function of the form a / mu + b through the two previous // iterates and use its zero to compute the next iterate - bool useBisection = fPrev*fCur>0; - while (fCur!=0 && abs(muCur - muPrev) > 8 * NumTraits::epsilon() * numext::maxi(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits::epsilon() && !useBisection) + bool useBisection = fPrev*fCur>Literal(0); + while (fCur!=Literal(0) && abs(muCur - muPrev) > Literal(8) * NumTraits::epsilon() * numext::maxi(abs(muCur), abs(muPrev)) && abs(fCur - fPrev)>NumTraits::epsilon() && !useBisection) { ++m_numIters; // Find a and b such that the function f(mu) = a / mu + b matches the current and previous samples. 
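Spelling that two-point fit out (a short derivation matching the code below): imposing \f$ f_{\mathrm{prev}} = a/\mu_{\mathrm{prev}} + b \f$ and \f$ f_{\mathrm{cur}} = a/\mu_{\mathrm{cur}} + b \f$ and subtracting gives \f$ a = (f_{\mathrm{cur}} - f_{\mathrm{prev}})/(1/\mu_{\mathrm{cur}} - 1/\mu_{\mathrm{prev}}) \f$, hence \f$ b = f_{\mathrm{cur}} - a/\mu_{\mathrm{cur}} \f$; the model vanishes at \f$ \mu = -a/b \f$, which is taken as the next iterate.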
- RealScalar a = (fCur - fPrev) / (1/muCur - 1/muPrev); + RealScalar a = (fCur - fPrev) / (Literal(1)/muCur - Literal(1)/muPrev); RealScalar b = fCur - a / muCur; // And find mu such that f(mu)==0: RealScalar muZero = -a/b; @@ -803,8 +826,8 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d fCur = fZero; - if (shift == left && (muCur < 0 || muCur > right - left)) useBisection = true; - if (shift == right && (muCur < -(right - left) || muCur > 0)) useBisection = true; + if (shift == left && (muCur < Literal(0) || muCur > right - left)) useBisection = true; + if (shift == right && (muCur < -(right - left) || muCur > Literal(0))) useBisection = true; if (abs(fCur)>abs(fPrev)) useBisection = true; } @@ -817,23 +840,33 @@ void BDCSVD::computeSingVals(const ArrayRef& col0, const ArrayRef& d RealScalar leftShifted, rightShifted; if (shift == left) { - leftShifted = (std::numeric_limits::min)(); + // to avoid overflow, we must have mu > max(real_min, |z(k)|/sqrt(real_max)), + // the factor 2 is to be more conservative + leftShifted = numext::maxi( (std::numeric_limits::min)(), Literal(2) * abs(col0(k)) / sqrt((std::numeric_limits::max)()) ); + + // check that we did it right: + eigen_internal_assert( (numext::isfinite)( (col0(k)/leftShifted)*(col0(k)/(diag(k)+shift+leftShifted)) ) ); // I don't understand why the case k==0 would be special there: - // if (k == 0) rightShifted = right - left; else - rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.6)); // theoretically we can take 0.5, but let's be safe + // if (k == 0) rightShifted = right - left; else + rightShifted = (k==actual_n-1) ? right : ((right - left) * RealScalar(0.51)); // theoretically we can take 0.5, but let's be safe } else { - leftShifted = -(right - left) * RealScalar(0.6); - rightShifted = -(std::numeric_limits::min)(); + leftShifted = -(right - left) * RealScalar(0.51); + if(k+1( (std::numeric_limits::min)(), abs(col0(k+1)) / sqrt((std::numeric_limits::max)()) ); + else + rightShifted = -(std::numeric_limits::min)(); } RealScalar fLeft = secularEq(leftShifted, col0, diag, perm, diagShifted, shift); + eigen_internal_assert(fLeft::computeSingVals(const ArrayRef& col0, const ArrayRef& d std::cout << k << " : " << fLeft << " * " << fRight << " == " << fLeft * fRight << " ; " << left << " - " << right << " -> " << leftShifted << " " << rightShifted << " shift=" << shift << "\n"; } #endif - eigen_internal_assert(fLeft * fRight < 0); - - while (rightShifted - leftShifted > 2 * NumTraits::epsilon() * numext::maxi(abs(leftShifted), abs(rightShifted))) - { - RealScalar midShifted = (leftShifted + rightShifted) / 2; - fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift); - if (fLeft * fMid < 0) - { - rightShifted = midShifted; - } - else - { - leftShifted = midShifted; - fLeft = fMid; - } - } + eigen_internal_assert(fLeft * fRight < Literal(0)); - muCur = (leftShifted + rightShifted) / 2; + if(fLeft Literal(2) * NumTraits::epsilon() * numext::maxi(abs(leftShifted), abs(rightShifted))) + { + RealScalar midShifted = (leftShifted + rightShifted) / Literal(2); + fMid = secularEq(midShifted, col0, diag, perm, diagShifted, shift); + eigen_internal_assert((numext::isfinite)(fMid)); + + if (fLeft * fMid < Literal(0)) + { + rightShifted = midShifted; + } + else + { + leftShifted = midShifted; + fLeft = fMid; + } + } + muCur = (leftShifted + rightShifted) / Literal(2); + } + else + { + // We have a problem as shifting on the left or right gives either a positive or negative value + // at the
middle of [left,right]... + // Instead of aborting or entering an infinite loop, + // let's just use the middle as the estimated zero-crossing: + muCur = (right - left) * RealScalar(0.5); + if(shift == right) + muCur = -muCur; + } } singVals[k] = shift + muCur; @@ -892,8 +939,8 @@ void BDCSVD::perturbCol0 // The offset lets us skip deflated entries while computing zhat for (Index k = 0; k < n; ++k) { - if (col0(k) == 0) // deflated - zhat(k) = 0; + if (col0(k) == Literal(0)) // deflated + zhat(k) = Literal(0); else { // see equation (3.6) @@ -908,7 +955,7 @@ void BDCSVD::perturbCol0 Index j = i 0.9 ) + if(i!=k && numext::abs(((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) - 1) > 0.9 ) std::cout << " " << ((singVals(j)+dk)*(mus(j)+(shifts(j)-dk)))/((diag(i)+dk)*(diag(i)-dk)) << " == (" << (singVals(j)+dk) << " * " << (mus(j)+(shifts(j)-dk)) << ") / (" << (diag(i)+dk) << " * " << (diag(i)-dk) << ")\n"; #endif @@ -918,7 +965,7 @@ void BDCSVD::perturbCol0 std::cout << "zhat(" << k << ") = sqrt( " << prod << ") ; " << (singVals(last) + dk) << " * " << mus(last) + shifts(last) << " - " << dk << "\n"; #endif RealScalar tmp = sqrt(prod); - zhat(k) = col0(k) > 0 ? tmp : -tmp; + zhat(k) = col0(k) > Literal(0) ? RealScalar(tmp) : RealScalar(-tmp); } } } @@ -934,7 +981,7 @@ void BDCSVD::computeSingVecs for (Index k = 0; k < n; ++k) { - if (zhat(k) == 0) + if (zhat(k) == Literal(0)) { U.col(k) = VectorType::Unit(n+1, k); if (m_compV) V.col(k) = VectorType::Unit(n, k); @@ -947,7 +994,7 @@ void BDCSVD::computeSingVecs Index i = perm(l); U(i,k) = zhat(i)/(((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k])); } - U(n,k) = 0; + U(n,k) = Literal(0); U.col(k).normalize(); if (m_compV) @@ -958,7 +1005,7 @@ void BDCSVD::computeSingVecs Index i = perm(l); V(i,k) = diag(i) * zhat(i) / (((diag(i) - shifts(k)) - mus(k)) )/( (diag(i) + singVals[k])); } - V(0,k) = -1; + V(0,k) = Literal(-1); V.col(k).normalize(); } } @@ -979,15 +1026,15 @@ void BDCSVD::deflation43(Index firstCol, Index shift, Index i, Index Index start = firstCol + shift; RealScalar c = m_computed(start, start); RealScalar s = m_computed(start+i, start); - RealScalar r = sqrt(numext::abs2(c) + numext::abs2(s)); - if (r == 0) + RealScalar r = numext::hypot(c,s); + if (r == Literal(0)) { - m_computed(start+i, start+i) = 0; + m_computed(start+i, start+i) = Literal(0); return; } m_computed(start,start) = r; - m_computed(start+i, start) = 0; - m_computed(start+i, start+i) = 0; + m_computed(start+i, start) = Literal(0); + m_computed(start+i, start+i) = Literal(0); JacobiRotation J(c/r,-s/r); if (m_compU) m_naiveU.middleRows(firstCol, size+1).applyOnTheRight(firstCol, firstCol+i, J); @@ -1020,7 +1067,7 @@ void BDCSVD::deflation44(Index firstColu , Index firstColm, Index fi << m_computed(firstColm + i+1, firstColm+i+1) << " " << m_computed(firstColm + i+2, firstColm+i+2) << "\n"; #endif - if (r==0) + if (r==Literal(0)) { m_computed(firstColm + i, firstColm + i) = m_computed(firstColm + j, firstColm + j); return; @@ -1029,7 +1076,7 @@ void BDCSVD::deflation44(Index firstColu , Index firstColm, Index fi s/=r; m_computed(firstColm + i, firstColm) = r; m_computed(firstColm + j, firstColm + j) = m_computed(firstColm + i, firstColm + i); - m_computed(firstColm + j, firstColm) = 0; + m_computed(firstColm + j, firstColm) = Literal(0); JacobiRotation J(c,-s); if (m_compU) m_naiveU.middleRows(firstColu, size+1).applyOnTheRight(firstColu + i, firstColu + j, J); @@ -1053,7 +1100,7 @@ void BDCSVD::deflation(Index firstCol, Index
lastCol, Index k, Index const RealScalar considerZero = (std::numeric_limits::min)(); RealScalar maxDiag = diag.tail((std::max)(Index(1),length-1)).cwiseAbs().maxCoeff(); RealScalar epsilon_strict = numext::maxi(considerZero,NumTraits::epsilon() * maxDiag); - RealScalar epsilon_coarse = 8 * NumTraits::epsilon() * numext::maxi(col0.cwiseAbs().maxCoeff(), maxDiag); + RealScalar epsilon_coarse = Literal(8) * NumTraits::epsilon() * numext::maxi(col0.cwiseAbs().maxCoeff(), maxDiag); #ifdef EIGEN_BDCSVD_SANITY_CHECKS assert(m_naiveU.allFinite()); @@ -1081,7 +1128,7 @@ void BDCSVD::deflation(Index firstCol, Index lastCol, Index k, Index #ifdef EIGEN_BDCSVD_DEBUG_VERBOSE std::cout << "deflation 4.2, set z(" << i << ") to zero because " << abs(col0(i)) << " < " << epsilon_strict << " (diag(" << i << ")=" << diag(i) << ")\n"; #endif - col0(i) = 0; + col0(i) = Literal(0); } //condition 4.3 diff --git a/eigenlib/Eigen/src/SVD/JacobiSVD.h b/eigenlib/Eigen/src/SVD/JacobiSVD.h index e0cfb628..43488b1e 100644 --- a/eigenlib/Eigen/src/SVD/JacobiSVD.h +++ b/eigenlib/Eigen/src/SVD/JacobiSVD.h @@ -112,9 +112,11 @@ public: ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Options = MatrixType::Options + TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor)) + : ColsAtCompileTime==1 ? (MatrixType::Options | RowMajor) + : MatrixType::Options }; - typedef Matrix + typedef Matrix TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) @@ -200,10 +202,12 @@ public: ColsAtCompileTime = MatrixType::ColsAtCompileTime, MaxRowsAtCompileTime = MatrixType::MaxRowsAtCompileTime, MaxColsAtCompileTime = MatrixType::MaxColsAtCompileTime, - Options = MatrixType::Options + TrOptions = RowsAtCompileTime==1 ? (MatrixType::Options & ~(RowMajor)) + : ColsAtCompileTime==1 ? (MatrixType::Options | RowMajor) + : MatrixType::Options }; - typedef Matrix + typedef Matrix TransposeTypeWithSameStorageOrder; void allocate(const JacobiSVD& svd) diff --git a/eigenlib/Eigen/src/SVD/JacobiSVD_LAPACKE.h b/eigenlib/Eigen/src/SVD/JacobiSVD_LAPACKE.h index 50272154..ff0516f6 100644 --- a/eigenlib/Eigen/src/SVD/JacobiSVD_LAPACKE.h +++ b/eigenlib/Eigen/src/SVD/JacobiSVD_LAPACKE.h @@ -61,9 +61,10 @@ JacobiSVD, ColPiv u = (LAPACKE_TYPE*)m_matrixU.data(); \ } else { ldu=1; u=&dummy; }\ MatrixType localV; \ - ldvt = (m_computeFullV) ? internal::convert_index(m_cols) : (m_computeThinV) ? internal::convert_index(m_diagSize) : 1; \ + lapack_int vt_rows = (m_computeFullV) ? internal::convert_index(m_cols) : (m_computeThinV) ? internal::convert_index(m_diagSize) : 1; \ if (computeV()) { \ - localV.resize(ldvt, m_cols); \ + localV.resize(vt_rows, m_cols); \ + ldvt = internal::convert_index(localV.outerStride()); \ vt = (LAPACKE_TYPE*)localV.data(); \ } else { ldvt=1; vt=&dummy; }\ Matrix superb; superb.resize(m_diagSize, 1); \ diff --git a/eigenlib/Eigen/src/SVD/SVDBase.h b/eigenlib/Eigen/src/SVD/SVDBase.h index cc90a3b7..53da2848 100644 --- a/eigenlib/Eigen/src/SVD/SVDBase.h +++ b/eigenlib/Eigen/src/SVD/SVDBase.h @@ -180,8 +180,10 @@ public: RealScalar threshold() const { eigen_assert(m_isInitialized || m_usePrescribedThreshold); + // this temporary is needed to workaround a MSVC issue + Index diagSize = (std::max)(1,m_diagSize); return m_usePrescribedThreshold ? 
m_prescribedThreshold - : (std::max)(1,m_diagSize)*NumTraits::epsilon(); + : RealScalar(diagSize)*NumTraits::epsilon(); } /** \returns true if \a U (full or thin) is asked for in this SVD decomposition */ diff --git a/eigenlib/Eigen/src/SVD/UpperBidiagonalization.h b/eigenlib/Eigen/src/SVD/UpperBidiagonalization.h index 0b146089..11ac847e 100644 --- a/eigenlib/Eigen/src/SVD/UpperBidiagonalization.h +++ b/eigenlib/Eigen/src/SVD/UpperBidiagonalization.h @@ -159,6 +159,8 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, traits::Flags & RowMajorBit> > Y) { typedef typename MatrixType::Scalar Scalar; + typedef typename MatrixType::RealScalar RealScalar; + typedef typename NumTraits::Literal Literal; enum { StorageOrder = traits::Flags & RowMajorBit }; typedef InnerStride ColInnerStride; typedef InnerStride RowInnerStride; @@ -263,7 +265,7 @@ void upperbidiagonalization_blocked_helper(MatrixType& A, SubMatType A10( A.block(bs,0, brows-bs,bs) ); SubMatType A01( A.block(0,bs, bs,bcols-bs) ); Scalar tmp = A01(bs-1,0); - A01(bs-1,0) = 1; + A01(bs-1,0) = Literal(1); A11.noalias() -= A10 * Y.topLeftCorner(bcols,bs).bottomRows(bcols-bs).adjoint(); A11.noalias() -= X.topLeftCorner(brows,bs).bottomRows(brows-bs) * A01; A01(bs-1,0) = tmp; diff --git a/eigenlib/Eigen/src/SparseCholesky/SimplicialCholesky.h b/eigenlib/Eigen/src/SparseCholesky/SimplicialCholesky.h index 2907f652..369e6804 100644 --- a/eigenlib/Eigen/src/SparseCholesky/SimplicialCholesky.h +++ b/eigenlib/Eigen/src/SparseCholesky/SimplicialCholesky.h @@ -608,7 +608,7 @@ public: } if(Base::m_diag.size()>0) - dest = Base::m_diag.asDiagonal().inverse() * dest; + dest = Base::m_diag.real().asDiagonal().inverse() * dest; if (Base::m_matrix.nonZeros()>0) // otherwise I==I { diff --git a/eigenlib/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h b/eigenlib/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h index 31e06995..7b6183d0 100644 --- a/eigenlib/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h +++ b/eigenlib/Eigen/src/SparseCholesky/SimplicialCholesky_impl.h @@ -156,7 +156,7 @@ void SimplicialCholeskyBase::factorize_preordered(const CholMatrixType& /* the nonzero entry L(k,i) */ Scalar l_ki; if(DoLDLT) - l_ki = yi / m_diag[i]; + l_ki = yi / numext::real(m_diag[i]); else yi = l_ki = yi / Lx[Lp[i]]; diff --git a/eigenlib/Eigen/src/SparseCore/AmbiVector.h b/eigenlib/Eigen/src/SparseCore/AmbiVector.h index 1233e164..2cb7747c 100644 --- a/eigenlib/Eigen/src/SparseCore/AmbiVector.h +++ b/eigenlib/Eigen/src/SparseCore/AmbiVector.h @@ -28,7 +28,7 @@ class AmbiVector typedef typename NumTraits::Real RealScalar; explicit AmbiVector(Index size) - : m_buffer(0), m_zero(0), m_size(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1) + : m_buffer(0), m_zero(0), m_size(0), m_end(0), m_allocatedSize(0), m_allocatedElements(0), m_mode(-1) { resize(size); } @@ -94,7 +94,7 @@ class AmbiVector Index allocSize = m_allocatedElements * sizeof(ListEl); allocSize = (allocSize + sizeof(Scalar) - 1)/sizeof(Scalar); Scalar* newBuffer = new Scalar[allocSize]; - memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); + std::memcpy(newBuffer, m_buffer, copyElements * sizeof(ListEl)); delete[] m_buffer; m_buffer = newBuffer; } @@ -147,7 +147,8 @@ template void AmbiVector<_Scalar,_StorageIndex>::init(int mode) { m_mode = mode; - if (m_mode==IsSparse) + // This is only necessary in sparse mode, but we set these unconditionally to avoid some maybe-uninitialized warnings + // if (m_mode==IsSparse) { m_llSize = 0; m_llStart = -1; @@ -336,7 +337,7 @@ class 
AmbiVector<_Scalar,_StorageIndex>::Iterator { do { ++m_cachedIndex; - } while (m_cachedIndex::Iterator ListEl* EIGEN_RESTRICT llElements = reinterpret_cast(m_vector.m_buffer); do { m_currentEl = llElements[m_currentEl].next; - } while (m_currentEl>=0 && abs(llElements[m_currentEl].value)=0 && abs(llElements[m_currentEl].value)<=m_epsilon); if (m_currentEl<0) { m_cachedIndex = -1; @@ -363,9 +364,9 @@ class AmbiVector<_Scalar,_StorageIndex>::Iterator protected: const AmbiVector& m_vector; // the target vector - StorageIndex m_currentEl; // the current element in sparse/linked-list mode + StorageIndex m_currentEl; // the current element in sparse/linked-list mode RealScalar m_epsilon; // epsilon used to prune zero coefficients - StorageIndex m_cachedIndex; // current coordinate + StorageIndex m_cachedIndex; // current coordinate Scalar m_cachedValue; // current value bool m_isDense; // mode of the vector }; diff --git a/eigenlib/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h b/eigenlib/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h index 492eb0a2..9db119b6 100644 --- a/eigenlib/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h +++ b/eigenlib/Eigen/src/SparseCore/ConservativeSparseSparseProduct.h @@ -17,7 +17,9 @@ namespace internal { template static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res, bool sortedInsertion = false) { - typedef typename remove_all::type::Scalar Scalar; + typedef typename remove_all::type::Scalar LhsScalar; + typedef typename remove_all::type::Scalar RhsScalar; + typedef typename remove_all::type::Scalar ResScalar; // make sure to call innerSize/outerSize since we fake the storage order. Index rows = lhs.innerSize(); @@ -25,7 +27,7 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r eigen_assert(lhs.outerSize() == rhs.innerSize()); ei_declare_aligned_stack_constructed_variable(bool, mask, rows, 0); - ei_declare_aligned_stack_constructed_variable(Scalar, values, rows, 0); + ei_declare_aligned_stack_constructed_variable(ResScalar, values, rows, 0); ei_declare_aligned_stack_constructed_variable(Index, indices, rows, 0); std::memset(mask,0,sizeof(bool)*rows); @@ -51,12 +53,12 @@ static void conservative_sparse_sparse_product_impl(const Lhs& lhs, const Rhs& r Index nnz = 0; for (typename evaluator::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) { - Scalar y = rhsIt.value(); + RhsScalar y = rhsIt.value(); Index k = rhsIt.index(); for (typename evaluator::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt) { Index i = lhsIt.index(); - Scalar x = lhsIt.value(); + LhsScalar x = lhsIt.value(); if(!mask[i]) { mask[i] = true; @@ -166,11 +168,12 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix; - RowMajorMatrix rhsRow = rhs; - RowMajorMatrix resRow(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(rhsRow, lhs, resRow); - res = resRow; + typedef SparseMatrix RowMajorRhs; + typedef SparseMatrix RowMajorRes; + RowMajorRhs rhsRow = rhs; + RowMajorRes resRow(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(rhsRow, lhs, resRow); + res = resRow; } }; @@ -179,10 +182,11 @@ struct conservative_sparse_sparse_product_selector RowMajorMatrix; - RowMajorMatrix lhsRow = lhs; - RowMajorMatrix resRow(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(rhs, lhsRow, resRow); + typedef SparseMatrix RowMajorLhs; + typedef SparseMatrix RowMajorRes; + RowMajorLhs lhsRow = lhs; + RowMajorRes 
resRow(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(rhs, lhsRow, resRow); res = resRow; } }; @@ -219,10 +223,11 @@ struct conservative_sparse_sparse_product_selector ColMajorMatrix; - ColMajorMatrix lhsCol = lhs; - ColMajorMatrix resCol(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(lhsCol, rhs, resCol); + typedef SparseMatrix ColMajorLhs; + typedef SparseMatrix ColMajorRes; + ColMajorLhs lhsCol = lhs; + ColMajorRes resCol(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(lhsCol, rhs, resCol); res = resCol; } }; @@ -232,10 +237,11 @@ struct conservative_sparse_sparse_product_selector ColMajorMatrix; - ColMajorMatrix rhsCol = rhs; - ColMajorMatrix resCol(lhs.rows(), rhs.cols()); - internal::conservative_sparse_sparse_product_impl(lhs, rhsCol, resCol); + typedef SparseMatrix ColMajorRhs; + typedef SparseMatrix ColMajorRes; + ColMajorRhs rhsCol = rhs; + ColMajorRes resCol(lhs.rows(), rhs.cols()); + internal::conservative_sparse_sparse_product_impl(lhs, rhsCol, resCol); res = resCol; } }; @@ -263,7 +269,8 @@ namespace internal { template static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, ResultType& res) { - typedef typename remove_all::type::Scalar Scalar; + typedef typename remove_all::type::Scalar LhsScalar; + typedef typename remove_all::type::Scalar RhsScalar; Index cols = rhs.outerSize(); eigen_assert(lhs.outerSize() == rhs.innerSize()); @@ -274,12 +281,12 @@ static void sparse_sparse_to_dense_product_impl(const Lhs& lhs, const Rhs& rhs, { for (typename evaluator::InnerIterator rhsIt(rhsEval, j); rhsIt; ++rhsIt) { - Scalar y = rhsIt.value(); + RhsScalar y = rhsIt.value(); Index k = rhsIt.index(); for (typename evaluator::InnerIterator lhsIt(lhsEval, k); lhsIt; ++lhsIt) { Index i = lhsIt.index(); - Scalar x = lhsIt.value(); + LhsScalar x = lhsIt.value(); res.coeffRef(i,j) += x * y; } } @@ -310,9 +317,9 @@ struct sparse_sparse_to_dense_product_selector ColMajorMatrix; - ColMajorMatrix lhsCol(lhs); - internal::sparse_sparse_to_dense_product_impl(lhsCol, rhs, res); + typedef SparseMatrix ColMajorLhs; + ColMajorLhs lhsCol(lhs); + internal::sparse_sparse_to_dense_product_impl(lhsCol, rhs, res); } }; @@ -321,9 +328,9 @@ struct sparse_sparse_to_dense_product_selector ColMajorMatrix; - ColMajorMatrix rhsCol(rhs); - internal::sparse_sparse_to_dense_product_impl(lhs, rhsCol, res); + typedef SparseMatrix ColMajorRhs; + ColMajorRhs rhsCol(rhs); + internal::sparse_sparse_to_dense_product_impl(lhs, rhsCol, res); } }; diff --git a/eigenlib/Eigen/src/SparseCore/SparseAssign.h b/eigenlib/Eigen/src/SparseCore/SparseAssign.h index 83776645..18352a84 100644 --- a/eigenlib/Eigen/src/SparseCore/SparseAssign.h +++ b/eigenlib/Eigen/src/SparseCore/SparseAssign.h @@ -143,10 +143,7 @@ struct Assignment dst.setZero(); internal::evaluator srcEval(src); - Index dstRows = src.rows(); - Index dstCols = src.cols(); - if((dst.rows()!=dstRows) || (dst.cols()!=dstCols)) - dst.resize(dstRows, dstCols); + resize_if_allowed(dst, src, func); internal::evaluator dstEval(dst); const Index outerEvaluationSize = (internal::evaluator::Flags&RowMajorBit) ? 
src.rows() : src.cols(); diff --git a/eigenlib/Eigen/src/SparseCore/SparseCompressedBase.h b/eigenlib/Eigen/src/SparseCore/SparseCompressedBase.h index e0850795..5ccb4665 100644 --- a/eigenlib/Eigen/src/SparseCore/SparseCompressedBase.h +++ b/eigenlib/Eigen/src/SparseCore/SparseCompressedBase.h @@ -279,11 +279,11 @@ struct evaluator > Flags = Derived::Flags }; - evaluator() : m_matrix(0) + evaluator() : m_matrix(0), m_zero(0) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } - explicit evaluator(const Derived &mat) : m_matrix(&mat) + explicit evaluator(const Derived &mat) : m_matrix(&mat), m_zero(0) { EIGEN_INTERNAL_CHECK_COST_VALUE(CoeffReadCost); } @@ -296,26 +296,42 @@ struct evaluator > operator const Derived&() const { return *m_matrix; } typedef typename DenseCoeffsBase::CoeffReturnType CoeffReturnType; - Scalar coeff(Index row, Index col) const - { return m_matrix->coeff(row,col); } - + const Scalar& coeff(Index row, Index col) const + { + Index p = find(row,col); + + if(p==Dynamic) + return m_zero; + else + return m_matrix->const_cast_derived().valuePtr()[p]; + } + Scalar& coeffRef(Index row, Index col) + { + Index p = find(row,col); + eigen_assert(p!=Dynamic && "written coefficient does not exist"); + return m_matrix->const_cast_derived().valuePtr()[p]; + } + +protected: + + Index find(Index row, Index col) const { eigen_internal_assert(row>=0 && rowrows() && col>=0 && colcols()); - + const Index outer = Derived::IsRowMajor ? row : col; const Index inner = Derived::IsRowMajor ? col : row; Index start = m_matrix->outerIndexPtr()[outer]; Index end = m_matrix->isCompressed() ? m_matrix->outerIndexPtr()[outer+1] : m_matrix->outerIndexPtr()[outer] + m_matrix->innerNonZeroPtr()[outer]; - eigen_assert(end>start && "you are using a non finalized sparse matrix or written coefficient does not exist"); - const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) - - m_matrix->innerIndexPtr(); - eigen_assert((pinnerIndexPtr()[p]==inner) && "written coefficient does not exist"); - return m_matrix->const_cast_derived().valuePtr()[p]; + eigen_assert(end>=start && "you are using a non finalized sparse matrix or written coefficient does not exist"); + const Index p = std::lower_bound(m_matrix->innerIndexPtr()+start, m_matrix->innerIndexPtr()+end,inner) - m_matrix->innerIndexPtr(); + + return ((pinnerIndexPtr()[p]==inner)) ? 
p : Dynamic; } const Derived *m_matrix; + const Scalar m_zero; }; } diff --git a/eigenlib/Eigen/src/SparseCore/SparseCwiseBinaryOp.h b/eigenlib/Eigen/src/SparseCore/SparseCwiseBinaryOp.h index 526c7121..e315e355 100644 --- a/eigenlib/Eigen/src/SparseCore/SparseCwiseBinaryOp.h +++ b/eigenlib/Eigen/src/SparseCore/SparseCwiseBinaryOp.h @@ -359,6 +359,16 @@ struct binary_evaluator, Lhs, Rhs>, Itera explicit binary_evaluator(const XprType& xpr) : Base(xpr) {} }; +// "sparse ./ dense" +template +struct binary_evaluator, Lhs, Rhs>, IteratorBased, IndexBased> + : sparse_conjunction_evaluator, Lhs, Rhs> > +{ + typedef CwiseBinaryOp, Lhs, Rhs> XprType; + typedef sparse_conjunction_evaluator Base; + explicit binary_evaluator(const XprType& xpr) : Base(xpr) {} +}; + // "sparse && sparse" template struct binary_evaluator, IteratorBased, IteratorBased> diff --git a/eigenlib/Eigen/src/SparseCore/SparseCwiseUnaryOp.h b/eigenlib/Eigen/src/SparseCore/SparseCwiseUnaryOp.h index ea797379..df6c28d2 100644 --- a/eigenlib/Eigen/src/SparseCore/SparseCwiseUnaryOp.h +++ b/eigenlib/Eigen/src/SparseCore/SparseCwiseUnaryOp.h @@ -49,6 +49,7 @@ template class unary_evaluator, IteratorBased>::InnerIterator : public unary_evaluator, IteratorBased>::EvalIterator { + protected: typedef typename XprType::Scalar Scalar; typedef typename unary_evaluator, IteratorBased>::EvalIterator Base; public: @@ -99,6 +100,7 @@ template class unary_evaluator, IteratorBased>::InnerIterator : public unary_evaluator, IteratorBased>::EvalIterator { + protected: typedef typename XprType::Scalar Scalar; typedef typename unary_evaluator, IteratorBased>::EvalIterator Base; public: diff --git a/eigenlib/Eigen/src/SparseCore/SparseDiagonalProduct.h b/eigenlib/Eigen/src/SparseCore/SparseDiagonalProduct.h index e4af49e0..941c03be 100644 --- a/eigenlib/Eigen/src/SparseCore/SparseDiagonalProduct.h +++ b/eigenlib/Eigen/src/SparseCore/SparseDiagonalProduct.h @@ -80,6 +80,8 @@ public: sparse_diagonal_product_evaluator(const SparseXprType &sparseXpr, const DiagonalCoeffType &diagCoeff) : m_sparseXprImpl(sparseXpr), m_diagCoeffImpl(diagCoeff) {} + + Index nonZerosEstimate() const { return m_sparseXprImpl.nonZerosEstimate(); } protected: evaluator m_sparseXprImpl; @@ -121,6 +123,8 @@ struct sparse_diagonal_product_evaluator m_sparseXprEval; diff --git a/eigenlib/Eigen/src/SparseCore/SparseMatrix.h b/eigenlib/Eigen/src/SparseCore/SparseMatrix.h index 529c7a0a..a5396538 100644 --- a/eigenlib/Eigen/src/SparseCore/SparseMatrix.h +++ b/eigenlib/Eigen/src/SparseCore/SparseMatrix.h @@ -327,7 +327,8 @@ class SparseMatrix m_outerIndex[j] = newOuterIndex[j]; m_innerNonZeros[j] = innerNNZ; } - m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1]; + if(m_outerSize>0) + m_outerIndex[m_outerSize] = m_outerIndex[m_outerSize-1] + m_innerNonZeros[m_outerSize-1] + reserveSizes[m_outerSize-1]; m_data.resize(m_outerIndex[m_outerSize]); } @@ -893,7 +894,7 @@ public: Index p = m_outerIndex[outer] + m_innerNonZeros[outer]++; m_data.index(p) = convert_index(inner); - return (m_data.value(p) = 0); + return (m_data.value(p) = Scalar(0)); } private: @@ -1274,7 +1275,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& m_innerNonZeros[outer]++; m_data.index(p) = inner; - return (m_data.value(p) = 0); + return (m_data.value(p) = Scalar(0)); } template @@ -1301,11 +1302,11 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& // starts with: [ 0 0 0 0 0 1 ...] 
and we are inserted in, e.g., // the 2nd inner vector... bool isLastVec = (!(previousOuter==-1 && m_data.size()!=0)) - && (size_t(m_outerIndex[outer+1]) == m_data.size()); + && (std::size_t(m_outerIndex[outer+1]) == m_data.size()); - size_t startId = m_outerIndex[outer]; - // FIXME let's make sure sizeof(long int) == sizeof(size_t) - size_t p = m_outerIndex[outer+1]; + std::size_t startId = m_outerIndex[outer]; + // FIXME let's make sure sizeof(long int) == sizeof(std::size_t) + std::size_t p = m_outerIndex[outer+1]; ++m_outerIndex[outer+1]; double reallocRatio = 1; @@ -1381,7 +1382,7 @@ EIGEN_DONT_INLINE typename SparseMatrix<_Scalar,_Options,_StorageIndex>::Scalar& } m_data.index(p) = inner; - return (m_data.value(p) = 0); + return (m_data.value(p) = Scalar(0)); } namespace internal { diff --git a/eigenlib/Eigen/src/SparseCore/SparseSelfAdjointView.h b/eigenlib/Eigen/src/SparseCore/SparseSelfAdjointView.h index 9e39be73..76117a01 100644 --- a/eigenlib/Eigen/src/SparseCore/SparseSelfAdjointView.h +++ b/eigenlib/Eigen/src/SparseCore/SparseSelfAdjointView.h @@ -47,6 +47,7 @@ template class SparseSelfAdjointView enum { Mode = _Mode, + TransposeMode = ((Mode & Upper) ? Lower : 0) | ((Mode & Lower) ? Upper : 0), RowsAtCompileTime = internal::traits::RowsAtCompileTime, ColsAtCompileTime = internal::traits::ColsAtCompileTime }; @@ -310,7 +311,7 @@ inline void sparse_selfadjoint_time_dense_product(const SparseLhsType& lhs, cons while (i && i.index() dstT(dst); - internal::sparse_selfadjoint_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha); + internal::sparse_selfadjoint_time_dense_product(rhsNested.transpose(), lhsNested.transpose(), dstT, alpha); } }; @@ -452,7 +453,7 @@ void permute_symm_to_fullsymm(const MatrixType& mat, SparseMatrix::type::Scalar Scalar; + typedef typename remove_all::type::Scalar RhsScalar; + typedef typename remove_all::type::Scalar ResScalar; typedef typename remove_all::type::StorageIndex StorageIndex; // make sure to call innerSize/outerSize since we fake the storage order. 
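The typedef split above (separate RhsScalar and ResScalar in place of a single Scalar) is what lets the pruning kernel handle operands of different scalar types. For orientation, this implementation backs the user-level pruned() product; a small sketch with arbitrary fill values:

    #include <Eigen/Sparse>

    int main()
    {
      typedef Eigen::SparseMatrix<double> SpMat;
      SpMat A(100, 100), B(100, 100);
      A.insert(0, 1) = 2.0;
      B.insert(1, 3) = 1e-20; // the resulting coefficient will be pruned
      A.makeCompressed();
      B.makeCompressed();
      // pruned(ref, eps) drops result entries with magnitude <= eps * ref;
      // that tolerance is what reaches sparse_sparse_product_with_pruning_impl.
      SpMat C = (A * B).pruned(1.0, 1e-12);
      return 0;
    }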
@@ -31,7 +32,7 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r eigen_assert(lhs.outerSize() == rhs.innerSize()); // allocate a temporary buffer - AmbiVector tempVector(rows); + AmbiVector tempVector(rows); // mimics a resizeByInnerOuter: if(ResultType::IsRowMajor) @@ -63,14 +64,14 @@ static void sparse_sparse_product_with_pruning_impl(const Lhs& lhs, const Rhs& r { // FIXME should be written like this: tmp += rhsIt.value() * lhs.col(rhsIt.index()) tempVector.restart(); - Scalar x = rhsIt.value(); + RhsScalar x = rhsIt.value(); for (typename evaluator::InnerIterator lhsIt(lhsEval, rhsIt.index()); lhsIt; ++lhsIt) { tempVector.coeffRef(lhsIt.index()) += lhsIt.value() * x; } } res.startVec(j); - for (typename AmbiVector::Iterator it(tempVector,tolerance); it; ++it) + for (typename AmbiVector::Iterator it(tempVector,tolerance); it; ++it) res.insertBackByOuterInner(j,it.index()) = it.value(); } res.finalize(); @@ -85,7 +86,6 @@ struct sparse_sparse_product_with_pruning_selector; template struct sparse_sparse_product_with_pruning_selector { - typedef typename traits::type>::Scalar Scalar; typedef typename ResultType::RealScalar RealScalar; static void run(const Lhs& lhs, const Rhs& rhs, ResultType& res, const RealScalar& tolerance) @@ -129,8 +129,8 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixLhs; - typedef SparseMatrix ColMajorMatrixRhs; + typedef SparseMatrix ColMajorMatrixLhs; + typedef SparseMatrix ColMajorMatrixRhs; ColMajorMatrixLhs colLhs(lhs); ColMajorMatrixRhs colRhs(rhs); internal::sparse_sparse_product_with_pruning_impl(colLhs, colRhs, res, tolerance); @@ -149,7 +149,7 @@ struct sparse_sparse_product_with_pruning_selector RowMajorMatrixLhs; + typedef SparseMatrix RowMajorMatrixLhs; RowMajorMatrixLhs rowLhs(lhs); sparse_sparse_product_with_pruning_selector(rowLhs,rhs,res,tolerance); } @@ -161,7 +161,7 @@ struct sparse_sparse_product_with_pruning_selector RowMajorMatrixRhs; + typedef SparseMatrix RowMajorMatrixRhs; RowMajorMatrixRhs rowRhs(rhs); sparse_sparse_product_with_pruning_selector(lhs,rowRhs,res,tolerance); } @@ -173,7 +173,7 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixRhs; + typedef SparseMatrix ColMajorMatrixRhs; ColMajorMatrixRhs colRhs(rhs); internal::sparse_sparse_product_with_pruning_impl(lhs, colRhs, res, tolerance); } @@ -185,7 +185,7 @@ struct sparse_sparse_product_with_pruning_selector ColMajorMatrixLhs; + typedef SparseMatrix ColMajorMatrixLhs; ColMajorMatrixLhs colLhs(lhs); internal::sparse_sparse_product_with_pruning_impl(colLhs, rhs, res, tolerance); } diff --git a/eigenlib/Eigen/src/SparseCore/SparseView.h b/eigenlib/Eigen/src/SparseCore/SparseView.h index 7c4aea74..92b3d1f7 100644 --- a/eigenlib/Eigen/src/SparseCore/SparseView.h +++ b/eigenlib/Eigen/src/SparseCore/SparseView.h @@ -90,6 +90,7 @@ struct unary_evaluator, IteratorBased> class InnerIterator : public EvalIterator { + protected: typedef typename XprType::Scalar Scalar; public: diff --git a/eigenlib/Eigen/src/SparseLU/SparseLU.h b/eigenlib/Eigen/src/SparseLU/SparseLU.h index f883ab38..87f0efe3 100644 --- a/eigenlib/Eigen/src/SparseLU/SparseLU.h +++ b/eigenlib/Eigen/src/SparseLU/SparseLU.h @@ -43,8 +43,8 @@ template struct SparseLUMatrixURetu * Simple example with key steps * \code * VectorXd x(n), b(n); - * SparseMatrix A; - * SparseLU, COLAMDOrdering > solver; + * SparseMatrix A; + * SparseLU, COLAMDOrdering > solver; * // fill A and b; * // Compute the ordering permutation vector from the structural pattern of A * 
solver.analyzePattern(A); @@ -499,8 +499,6 @@ void SparseLU::factorize(const MatrixType& matrix) eigen_assert(m_analysisIsOk && "analyzePattern() should be called first"); eigen_assert((matrix.rows() == matrix.cols()) && "Only for square matrices"); - typedef typename IndexVector::Scalar StorageIndex; - m_isInitialized = true; diff --git a/eigenlib/Eigen/src/SparseQR/SparseQR.h b/eigenlib/Eigen/src/SparseQR/SparseQR.h index 2d4498b0..7409fcae 100644 --- a/eigenlib/Eigen/src/SparseQR/SparseQR.h +++ b/eigenlib/Eigen/src/SparseQR/SparseQR.h @@ -52,7 +52,7 @@ namespace internal { * rank-revealing permutations. Use colsPermutation() to get it. * * Q is the orthogonal matrix represented as products of Householder reflectors. - * Use matrixQ() to get an expression and matrixQ().transpose() to get the transpose. + * Use matrixQ() to get an expression and matrixQ().adjoint() to get the adjoint. * You can then apply it to a vector. * * R is the sparse triangular or trapezoidal matrix. The latter occurs when A is rank-deficient. * * \implsparsesolverconcept * * \warning The input sparse matrix A must be in compressed mode (see SparseMatrix::makeCompressed()). + * \warning For complex matrices matrixQ().transpose() will actually return the adjoint matrix. * */ template @@ -196,9 +197,9 @@ class SparseQR : public SparseSolverBase > Index rank = this->rank(); - // Compute Q^* * b; + // Compute Q^* * b; typename Dest::PlainObject y, b; - y = this->matrixQ().transpose() * B; + y = this->matrixQ().adjoint() * B; b = y; // Solve with the triangular matrix R @@ -604,7 +605,7 @@ struct SparseQR_QProduct : ReturnByValue @@ -668,13 +672,14 @@ struct SparseQRMatrixQReturnType : public EigenBase(m_qr,other.derived(),false); } + // To use for operations with the adjoint of Q SparseQRMatrixQTransposeReturnType adjoint() const { return SparseQRMatrixQTransposeReturnType(m_qr); } inline Index rows() const { return m_qr.rows(); } - inline Index cols() const { return (std::min)(m_qr.rows(),m_qr.cols()); } - // To use for operations with the transpose of Q + inline Index cols() const { return m_qr.rows(); } + // To use for operations with the transpose of Q FIXME this is the same as adjoint at the moment SparseQRMatrixQTransposeReturnType transpose() const { return SparseQRMatrixQTransposeReturnType(m_qr); @@ -682,6 +687,7 @@ struct SparseQRMatrixQReturnType : public EigenBase struct SparseQRMatrixQTransposeReturnType { @@ -712,7 +718,7 @@ struct Assignment, internal: typedef typename DstXprType::StorageIndex StorageIndex; static void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op &/*func*/) { - typename DstXprType::PlainObject idMat(src.m_qr.rows(), src.m_qr.rows()); + typename DstXprType::PlainObject idMat(src.rows(), src.cols()); idMat.setIdentity(); // Sort the sparse householder reflectors if needed const_cast(&src.m_qr)->_sort_matrix_Q(); diff --git a/eigenlib/Eigen/src/StlSupport/StdDeque.h b/eigenlib/Eigen/src/StlSupport/StdDeque.h index cf1fedf9..af158f42 100644 --- a/eigenlib/Eigen/src/StlSupport/StdDeque.h +++ b/eigenlib/Eigen/src/StlSupport/StdDeque.h @@ -98,8 +98,10 @@ namespace std { { return deque_base::insert(position,x); } void insert(const_iterator position, size_type new_size, const value_type& x) { deque_base::insert(position, new_size, x); } -#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2) +#elif defined(_GLIBCXX_DEQUE) && EIGEN_GNUC_AT_LEAST(4,2) && !EIGEN_GNUC_AT_LEAST(10, 1) // workaround GCC std::deque implementation + // GCC
10.1 doesn't let us access _Deque_impl _M_impl anymore and we have to + // fall-back to the default case void resize(size_type new_size, const value_type& x) { if (new_size < deque_base::size()) @@ -108,7 +110,7 @@ namespace std { deque_base::insert(deque_base::end(), new_size - deque_base::size(), x); } #else - // either GCC 4.1 or non-GCC + // either non-GCC or GCC between 4.1 and 10.1 // default implementation which should always work. void resize(size_type new_size, const value_type& x) { diff --git a/eigenlib/Eigen/src/StlSupport/details.h b/eigenlib/Eigen/src/StlSupport/details.h index e42ec024..2cfd13e0 100644 --- a/eigenlib/Eigen/src/StlSupport/details.h +++ b/eigenlib/Eigen/src/StlSupport/details.h @@ -22,13 +22,13 @@ namespace Eigen { class aligned_allocator_indirection : public EIGEN_ALIGNED_ALLOCATOR { public: - typedef size_t size_type; - typedef ptrdiff_t difference_type; - typedef T* pointer; - typedef const T* const_pointer; - typedef T& reference; - typedef const T& const_reference; - typedef T value_type; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + typedef T* pointer; + typedef const T* const_pointer; + typedef T& reference; + typedef const T& const_reference; + typedef T value_type; template struct rebind diff --git a/eigenlib/Eigen/src/SuperLUSupport/SuperLUSupport.h b/eigenlib/Eigen/src/SuperLUSupport/SuperLUSupport.h index 50a69f30..7261c7d0 100644 --- a/eigenlib/Eigen/src/SuperLUSupport/SuperLUSupport.h +++ b/eigenlib/Eigen/src/SuperLUSupport/SuperLUSupport.h @@ -297,8 +297,8 @@ SluMatrix asSluMatrix(MatrixType& mat) template MappedSparseMatrix map_superlu(SluMatrix& sluMat) { - eigen_assert((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR - || (Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC); + eigen_assert(((Flags&RowMajor)==RowMajor && sluMat.Stype == SLU_NR) + || ((Flags&ColMajor)==ColMajor && sluMat.Stype == SLU_NC)); Index outerSize = (Flags&RowMajor)==RowMajor ? sluMat.ncol : sluMat.nrow; diff --git a/eigenlib/Eigen/src/UmfPackSupport/UmfPackSupport.h b/eigenlib/Eigen/src/UmfPackSupport/UmfPackSupport.h index dc74de93..91c09ab1 100644 --- a/eigenlib/Eigen/src/UmfPackSupport/UmfPackSupport.h +++ b/eigenlib/Eigen/src/UmfPackSupport/UmfPackSupport.h @@ -10,19 +10,37 @@ #ifndef EIGEN_UMFPACKSUPPORT_H #define EIGEN_UMFPACKSUPPORT_H -namespace Eigen { +namespace Eigen { /* TODO extract L, extract U, compute det, etc... 
*/ // generic double/complex wrapper functions: -inline void umfpack_defaults(double control[UMFPACK_CONTROL], double) +inline void umfpack_defaults(double control[UMFPACK_CONTROL], double) { umfpack_di_defaults(control); } -inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex) +inline void umfpack_defaults(double control[UMFPACK_CONTROL], std::complex) { umfpack_zi_defaults(control); } +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], double) +{ umfpack_di_report_info(control, info);} + +inline void umfpack_report_info(double control[UMFPACK_CONTROL], double info[UMFPACK_INFO], std::complex) +{ umfpack_zi_report_info(control, info);} + +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, double) +{ umfpack_di_report_status(control, status);} + +inline void umfpack_report_status(double control[UMFPACK_CONTROL], int status, std::complex) +{ umfpack_zi_report_status(control, status);} + +inline void umfpack_report_control(double control[UMFPACK_CONTROL], double) +{ umfpack_di_report_control(control);} + +inline void umfpack_report_control(double control[UMFPACK_CONTROL], std::complex) +{ umfpack_zi_report_control(control);} + inline void umfpack_free_numeric(void **Numeric, double) { umfpack_di_free_numeric(Numeric); *Numeric = 0; } @@ -156,6 +174,7 @@ class UmfPackLU : public SparseSolverBase > public: typedef Array UmfpackControl; + typedef Array UmfpackInfo; UmfPackLU() : m_dummy(0,0), mp_matrix(m_dummy) @@ -215,7 +234,7 @@ class UmfPackLU : public SparseSolverBase > return m_q; } - /** Computes the sparse Cholesky decomposition of \a matrix + /** Computes the sparse LU decomposition of \a matrix * Note that the matrix should be column-major, and in compressed format for best performance. * \sa SparseMatrix::makeCompressed(). */ @@ -240,7 +259,7 @@ class UmfPackLU : public SparseSolverBase > { if(m_symbolic) umfpack_free_symbolic(&m_symbolic,Scalar()); if(m_numeric) umfpack_free_numeric(&m_numeric,Scalar()); - + grab(matrix.derived()); analyzePattern_impl(); @@ -267,7 +286,7 @@ class UmfPackLU : public SparseSolverBase > { return m_control; } - + /** Provides access to the control settings array used by UmfPack. * * If this array contains NaNs, the default values are used. @@ -278,7 +297,7 @@ class UmfPackLU : public SparseSolverBase > { return m_control; } - + /** Performs a numeric decomposition of \a matrix * * The given matrix must have the same sparsity as the matrix on which the pattern analysis has been performed. * * \sa analyzePattern(), compute() */ @@ -293,10 +312,38 @@ class UmfPackLU : public SparseSolverBase > umfpack_free_numeric(&m_numeric,Scalar()); grab(matrix.derived()); - + factorize_impl(); } + /** Prints the current UmfPack control settings. + * + * \sa umfpackControl() + */ + void umfpackReportControl() + { + umfpack_report_control(m_control.data(), Scalar()); + } + + /** Prints statistics collected by UmfPack. + * + * \sa analyzePattern(), compute() + */ + void umfpackReportInfo() + { + eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); + umfpack_report_info(m_control.data(), m_umfpackInfo.data(), Scalar()); + } + + /** Prints the status of the previous factorization operation performed by UmfPack (symbolic or numerical factorization).
+ * + * \sa analyzePattern(), compute() + */ + void umfpackReportStatus() { + eigen_assert(m_analysisIsOk && "UmfPackLU: you must first call analyzePattern()"); + umfpack_report_status(m_control.data(), m_fact_errorCode, Scalar()); + } + /** \internal */ template bool _solve_impl(const MatrixBase &b, MatrixBase &x) const; @@ -314,41 +361,42 @@ class UmfPackLU : public SparseSolverBase > m_numeric = 0; m_symbolic = 0; m_extractedDataAreDirty = true; + + umfpack_defaults(m_control.data(), Scalar()); } - + void analyzePattern_impl() { - umfpack_defaults(m_control.data(), Scalar()); - int errorCode = 0; - errorCode = umfpack_symbolic(internal::convert_index(mp_matrix.rows()), - internal::convert_index(mp_matrix.cols()), - mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), - &m_symbolic, m_control.data(), 0); + m_fact_errorCode = umfpack_symbolic(internal::convert_index(mp_matrix.rows()), + internal::convert_index(mp_matrix.cols()), + mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), + &m_symbolic, m_control.data(), m_umfpackInfo.data()); m_isInitialized = true; - m_info = errorCode ? InvalidInput : Success; + m_info = m_fact_errorCode ? InvalidInput : Success; m_analysisIsOk = true; m_factorizationIsOk = false; m_extractedDataAreDirty = true; } - + void factorize_impl() { + m_fact_errorCode = umfpack_numeric(mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), - m_symbolic, &m_numeric, m_control.data(), 0); + m_symbolic, &m_numeric, m_control.data(), m_umfpackInfo.data()); m_info = m_fact_errorCode == UMFPACK_OK ? Success : NumericalIssue; m_factorizationIsOk = true; m_extractedDataAreDirty = true; } - + template void grab(const EigenBase &A) { mp_matrix.~UmfpackMatrixRef(); ::new (&mp_matrix) UmfpackMatrixRef(A.derived()); } - + void grab(const UmfpackMatrixRef &A) { if(&(A.derived()) != &mp_matrix) @@ -357,19 +405,20 @@ class UmfPackLU : public SparseSolverBase > ::new (&mp_matrix) UmfpackMatrixRef(A); } } - + // cached data to reduce reallocation, etc. 
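Taken together, the new report wrappers and the cached info array make UMFPACK's diagnostics reachable directly from the solver. A hypothetical session (requires the UMFPACK headers and library; how much is printed depends on the print level stored in the control array):

    #include <Eigen/Sparse>
    #include <Eigen/UmfPackSupport>

    int main()
    {
      Eigen::SparseMatrix<double> A(10, 10);
      A.setIdentity();
      Eigen::UmfPackLU<Eigen::SparseMatrix<double> > lu(A);
      // The three helpers added above forward to umfpack_*_report_* with
      // the control/info arrays the solver now keeps as members.
      lu.umfpackReportControl();
      lu.umfpackReportInfo();
      lu.umfpackReportStatus();
      return 0;
    }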
mutable LUMatrixType m_l; int m_fact_errorCode; UmfpackControl m_control; - + mutable UmfpackInfo m_umfpackInfo; + mutable LUMatrixType m_u; mutable IntColVectorType m_p; mutable IntRowVectorType m_q; UmfpackMatrixType m_dummy; UmfpackMatrixRef mp_matrix; - + void* m_numeric; void* m_symbolic; @@ -377,7 +426,7 @@ class UmfPackLU : public SparseSolverBase > int m_factorizationIsOk; int m_analysisIsOk; mutable bool m_extractedDataAreDirty; - + private: UmfPackLU(const UmfPackLU& ) { } }; @@ -427,7 +476,7 @@ bool UmfPackLU::_solve_impl(const MatrixBase &b, MatrixBas eigen_assert((BDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major rhs yet"); eigen_assert((XDerived::Flags&RowMajorBit)==0 && "UmfPackLU backend does not support non col-major result yet"); eigen_assert(b.derived().data() != x.derived().data() && " Umfpack does not support inplace solve"); - + int errorCode; Scalar* x_ptr = 0; Matrix x_tmp; @@ -442,7 +491,7 @@ bool UmfPackLU::_solve_impl(const MatrixBase &b, MatrixBas x_ptr = &x.col(j).coeffRef(0); errorCode = umfpack_solve(UMFPACK_A, mp_matrix.outerIndexPtr(), mp_matrix.innerIndexPtr(), mp_matrix.valuePtr(), - x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), 0); + x_ptr, &b.const_cast_derived().col(j).coeffRef(0), m_numeric, m_control.data(), m_umfpackInfo.data()); if(x.innerStride()!=1) x.col(j) = x_tmp; if (errorCode!=0) diff --git a/eigenlib/Eigen/src/plugins/ArrayCwiseBinaryOps.h b/eigenlib/Eigen/src/plugins/ArrayCwiseBinaryOps.h index 1f8a531a..05a7449b 100644 --- a/eigenlib/Eigen/src/plugins/ArrayCwiseBinaryOps.h +++ b/eigenlib/Eigen/src/plugins/ArrayCwiseBinaryOps.h @@ -119,7 +119,7 @@ OP(const Scalar& s) const { \ return this->OP(Derived::PlainObject::Constant(rows(), cols(), s)); \ } \ EIGEN_DEVICE_FUNC friend EIGEN_STRONG_INLINE const RCmp ## COMPARATOR ## ReturnType \ -OP(const Scalar& s, const Derived& d) { \ +OP(const Scalar& s, const EIGEN_CURRENT_STORAGE_BASE_CLASS& d) { \ return Derived::PlainObject::Constant(d.rows(), d.cols(), s).OP(d); \ } diff --git a/eigenlib/howto.txt b/eigenlib/howto.txt index 5f893409..99b20682 100644 --- a/eigenlib/howto.txt +++ b/eigenlib/howto.txt @@ -3,11 +3,12 @@ Current Eigen Version 3.2.1 (26.02.2014) updated on 14/05/2014 Current Eigen Version 3.2.2 (04.08.2014) updated on 21/10/2014 Current Eigen Version 3.2.5 (16.06.2015) updated on 24/09/2015 Current Eigen Version 3.3.2 (18.01.2017) updated on 24/01/2017 +Current Eigen Version 3.3.9 (04.12.2020) updated on 15/06/2021 To update the lib: - download Eigen - unzip it somewhere -- delete (in the filesystem) the content of the folder eigenlib/Eigen - copy the folders 'Eigen' there +- delete (in the filesystem) the content of the folder eigenlib/Eigen - copy the folders 'Eigen' there - check the git status. - update this file - commit everything
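As a closing illustration, the ArrayCwiseBinaryOps change earlier in this patch makes the scalar-on-the-left comparison operators take the storage base class instead of the plain Derived type; user code like the following is what exercises that overload (a sketch, values arbitrary):

    #include <Eigen/Dense>
    #include <iostream>

    int main()
    {
      Eigen::ArrayXf a = Eigen::ArrayXf::LinSpaced(5, 0.f, 4.f);
      // Scalar on the left: each coefficient a(i) is compared against 2.f.
      Eigen::Array<bool, Eigen::Dynamic, 1> mask = (2.f < a);
      std::cout << mask.transpose() << "\n";
      return 0;
    }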