Matrix<float> is slower than Matrix<double>

I'm new to Eigen. I wrote two test cases to compare the time of double and float matrix multiplication, and found that the float version is slower than the double version.
The calculation is:
 m3(i, j) = m1(i, j) * m2(i, j)

If the calculation is replaced with:
m3(i, j) = m1(i, j) * factor
then the float version takes about half the time of the double version.

My question is: why is the float version slower than the double version, and how can I resolve it?

#include <iostream>
#include <gtest/gtest.h>
#include <glog/logging.h>
#include <vector>
#include <map>

#ifndef BOOST_TEST
#include <boost/date_time.hpp>

// The tests below use std:: and Eigen:: names (cout, Matrix, Dynamic, ...) unqualified.
using namespace std;
using namespace Eigen;
// Times a coefficient-by-coefficient product of two 1600x2000 row-major
// double matrices and prints the elapsed time in microseconds.
TEST(MatrixCompare, DoubleSpeed) {
    const double factor = 2.3;
    Matrix<double, Dynamic, Dynamic, RowMajor> m1(1600, 2000);
    Matrix<double, Dynamic, Dynamic, RowMajor> m2(1600, 2000);
    Matrix<double, Dynamic, Dynamic, RowMajor> m3(1600, 2000);

    // Fill both operands with deterministic values before the clock starts.
    for (int i = 0; i < m1.rows(); i++) {
        for (int j = 0; j < m1.cols(); j++) {
            m1(i, j) = (1 + j) * (i + 1);
            m2(i, j) = (i + 2) * (j + 2);
        }
    }

    namespace bpt = boost::posix_time;

    // Timed region: only the element-wise product loop is measured.
    const bpt::ptime tm_begin1 = bpt::microsec_clock::local_time();
    for (int i = 0; i < m1.rows(); i++) {
        for (int j = 0; j < m1.cols(); j++) {
            m3(i, j) = m1(i, j) * m2(i, j);
        }
    }
    const bpt::ptime tm_end1 = bpt::microsec_clock::local_time();

    const bpt::time_duration dur1 = tm_end1 - tm_begin1;
    cout << "DoubleSpeed time: " << dur1.total_microseconds() << " us" << endl;

    // Untimed: overwrite m3 with the scaled matrix and print its top-left 5x5 corner.
    m3 = m1 * factor;
    cout << m3.block(0, 0, 5, 5) << endl;
}

// Times a coefficient-by-coefficient product of two 1600x2000 row-major
// float matrices and prints the elapsed time in microseconds.
TEST(MatrixCompare, FloatSpeed) {
    const float factor = 2.3f;  // float literal: no double-to-float conversion
    Matrix<float, Dynamic, Dynamic, RowMajor> m1(1600, 2000);
    Matrix<float, Dynamic, Dynamic, RowMajor> m2(1600, 2000);
    Matrix<float, Dynamic, Dynamic, RowMajor> m3(1600, 2000);

    // Fill both operands with deterministic values before the clock starts.
    for (int i = 0; i < m1.rows(); i++) {
        for (int j = 0; j < m1.cols(); j++) {
            m1(i, j) = (1 + j) * (i + 1);
            m2(i, j) = (i + 2) * (j + 2);
        }
    }

    namespace bpt = boost::posix_time;

    // Timed region: only the element-wise product loop is measured.
    const bpt::ptime tm_begin1 = bpt::microsec_clock::local_time();
    for (int i = 0; i < m1.rows(); i++) {
        for (int j = 0; j < m1.cols(); j++) {
            m3(i, j) = m1(i, j) * m2(i, j);
        }
    }
    const bpt::ptime tm_end1 = bpt::microsec_clock::local_time();

    const bpt::time_duration dur1 = tm_end1 - tm_begin1;
    cout << "FloatSpeed time: " << dur1.total_microseconds() << " us" << endl;

    // Untimed: overwrite m3 with the scaled matrix and print its top-left 5x5 corner.
    m3 = m1 * factor;
    cout << m3.block(0, 0, 5, 5) << endl;
}

The link command looks like this (it seems no optimization flags are set):
g++   -Wno-unused-local-typedefs -Wall -std=c++0x -fPIC   CMakeFiles/test_measurer.dir/main.cpp.o  -o ../../../../bin/test_measurer  -L/meda_home/toolchain/tool/gperftools/2.7/lib  -L/meda_home/toolchain/library/gtest/1.10.0/lib64  -L/meda_home/toolchain/library/glog/0.4.0/lib64  -L/meda_home/toolchain/library/boost/1.72.0/lib  -L/meda_home/toolchain/library/fftw/3.3.8/lib  -L/meda_home/toolchain/library/opencv/  -L/meda_home/toolchain/library/nlopt/2.6.2/lib64  -L/meda_home/roderick/optics/libs  -L/meda_home/sys/build/noah/release/lib  -L/meda_home/sys/build/amutil/lib  -L/meda_home/toolchain/library/fftw/3.3.8/double/lib  -L/meda_home/toolchain/library/fftw/3.3.8/float/lib  -L/meda_home/sys_tool/opt_tool/mkl/lib/intel64 -rdynamic ../../opt/libamsim_basic.a -lgauge -lpthread -lboost_program_options -lboost_filesystem -lboost_system -lOAGEO -lOAIO -lOADB -lOAutil -lz -lglog -lgtest -lfftw3 -lfftw3f -lnlopt -lopencv_core -lopencv_objdetect -lopencv_imgproc -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl -lboost_program_options -lboost_filesystem -lboost_system -lOAGEO -lOAIO -lOADB -lOAutil -lz -lglog -lgtest -lfftw3 -lfftw3f -lnlopt -lopencv_core -lopencv_objdetect -lopencv_imgproc -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lm -ldl -Wl,-rpath,/meda_home/toolchain/tool/gperftools/2.7/lib:/meda_home/toolchain/library/gtest/1.10.0/lib64:/meda_home/toolchain/library/glog/0.4.0/lib64:/meda_home/toolchain/library/boost/1.72.0/lib:/meda_home/toolchain/library/fftw/3.3.8/lib:/meda_home/toolchain/library/opencv/

In another case, the float version is slower than the double version too:
matrix_of_complex3(i, j) = matrix_of_complex1(i, j) * matrix_of_complex2(i, j)

// Row and column counts passed to the matrices built in the
// ComplexDouble/ComplexFloat timing tests below.
constexpr int g_row_num = 4500;
constexpr int g_col_num = 3200;

// Scales every coefficient of a random complex<double> matrix by a complex
// constant and a real constant, then prints the top-left 10x10 corner.
// The duration reported by gtest covers the whole test body.
TEST(ComplexDouble, Timing) {
    double d1 = 3.14;
    complex<double> c1(3.4, 4.3);

    // Random() returns a lazy expression, not a matrix. Binding it with `auto`
    // would make every m1(i, j) below generate a fresh random number inside
    // the loop. Naming the matrix type evaluates the expression once, here.
    const Matrix<complex<double>, Dynamic, Dynamic, RowMajor> m1 =
        Matrix<complex<double>, Dynamic, Dynamic, RowMajor>::Random(g_row_num, g_col_num);
    Matrix<complex<double>, Dynamic, Dynamic, RowMajor> m2(g_row_num, g_col_num);

    for (int i = 0; i < m1.rows(); i++) {
        for (int j = 0; j < m1.cols(); j++) {
            m2(i, j) = m1(i, j) * c1 * d1;
        }
    }

    cout << m2.block(0, 0, 10, 10) << endl;
}

// Scales every coefficient of a random complex<float> matrix by a complex
// constant and a real constant, then prints the top-left 10x10 corner.
// The duration reported by gtest covers the whole test body.
TEST(ComplexFloat, Timing) {
    float d1 = 3.14f;
    complex<float> c1(3.4f, 4.3f);

    // Random() returns a lazy expression, not a matrix. Binding it with `auto`
    // would make every m1(i, j) below generate a fresh random number inside
    // the loop. Naming the matrix type evaluates the expression once, here.
    const Matrix<complex<float>, Dynamic, Dynamic, RowMajor> m1 =
        Matrix<complex<float>, Dynamic, Dynamic, RowMajor>::Random(g_row_num, g_col_num);
    Matrix<complex<float>, Dynamic, Dynamic, RowMajor> m2(g_row_num, g_col_num);

    for (int i = 0; i < m1.rows(); i++) {
        for (int j = 0; j < m1.cols(); j++) {
            m2(i, j) = m1(i, j) * c1 * d1;
        }
    }

    cout << m2.block(0, 0, 10, 10) << endl;
}

Test Result:
[----------] 1 test from ComplexDouble
[ RUN      ] ComplexDouble.Timing
  (11.9622,-4.76391)    (7.57194,-17.271)    (16.291,0.468158)     (7.156,-10.0055)   (1.64442,-18.4684)   (-19.4887,3.04485)     (6.8915,9.43352)  (-7.39686,-3.10288)   (-3.19588,3.59287)     (10.268,12.6298)
   (16.6181,4.57214)   (-9.22271,5.75099)   (6.44319,-1.28841)   (-4.33261,12.1227)   (-1.8963,-5.30527)   (14.1994,-9.57226)   (16.5955,-1.69768)  (-7.18996,0.610687)   (13.6211,-9.60615)    (5.61387,14.0553)
  (11.9204,-7.39493)  (-1.54411,-5.46907)   (-6.65892,-16.563)   (6.04319,-7.77907)   (1.92263,-9.32407)  (1.45918,-0.386767)    (6.26973,-15.928)   (18.8894,-1.34474)  (-5.40889,-7.94799)   (6.57231,-3.12474)
    (6.97008,10.865)    (2.36426,19.2347)   (4.52054,-4.40613)      (1.356,16.9818)  (-11.6305,0.188956)  (18.4705,-0.715135)    (17.3658,6.03462)   (4.80237,-15.0537)    (4.29732,2.21504)   (8.84842,-7.49431)
  (-13.1855,3.04028)   (-8.47538,3.31153)  (-21.3886,-1.45713)    (5.62472,3.28542)    (14.8775,-6.9832)  (-6.68966,-16.5691)   (9.77552,-4.40095)    (-10.2316,8.0795)  (-19.3376,-4.61249)   (-7.87485,9.81641)
  (-6.27428,1.45117)   (-6.85766,16.9591)   (-5.56633,8.77744)  (-11.0054,-11.3161)    (11.5139,11.4778)   (3.44057,-1.83339)    (-2.21169,13.547) (-19.5544,-0.383666)   (6.37967,-18.5878)   (-7.60339,13.3739)
 (-4.85457,-14.4045)     (10.5154,7.3036)  (-4.60871,-12.6508)   (0.596507,6.24256)   (0.982165,14.1581)     (2.91103,8.7621)   (4.24564,-17.5644)   (10.3552,-3.54941)   (-8.57908,8.11857)    (2.3476,-5.43621)
  (7.09012,-4.06822) (-1.27914,-0.471844)   (-4.6466,-15.8335)    (15.4649,5.15193)   (-5.4594,-8.66568)   (1.28489,-10.0818)    (-1.6819,16.4548)    (2.46613,2.62347)   (-9.30432,6.08379)   (3.03018,-17.4072)
  (-1.44176,21.6909)   (-14.2472,4.00012)   (14.4144,-6.31735)  (0.814833,-18.1089)   (-6.09504,18.2304)   (0.592451,8.40779)  (-11.1253,-13.1294)  (-18.6959,-1.92467)   (5.56531,-6.52042)   (-3.51627,12.0503)
  (-6.52932,-3.7738)    (2.09978,6.23612)   (5.90467,-10.5818)   (5.24844,-10.9746)    (-11.052,-12.669)    (1.21105,20.9772)    (2.20667,7.22686)    (21.6756,3.06041)   (-8.50669,1.68755)  (-2.46858,-1.46122)
[       OK ] ComplexDouble.Timing (2757 ms)
[----------] 1 test from ComplexDouble (2757 ms total)

[----------] 1 test from ComplexFloat
[ RUN      ] ComplexFloat.Timing
  (9.41872,-14.0621)    (4.53709,2.52084)    (-2.6496,15.6391)   (-5.41675,15.8699)  (0.964372,-19.3973)    (12.0572,2.70792)  (-9.98501,-6.93664)    (-11.325,6.51012)    (20.2115,4.01326)    (17.3532,4.42964)
  (0.838134,-14.216)   (1.79402,-23.3493)    (19.8017,1.19708)  (-1.72511,-8.55702)   (12.3892,-12.0709)   (2.92304,-12.4514)   (-5.48972,8.32045)   (9.17595,-12.4055)   (-15.874,-8.17748)   (-4.20945,20.6047)
 (-15.1947,-9.38803)   (4.45996,-10.3651)    (-9.1915,5.23389)   (17.1325,-4.05876)   (-9.26078,15.7354)     (15.2728,4.9197)  (-0.850976,4.11447)    (5.76246,15.0008)    (-7.47418,8.1391)    (19.9157,4.62661)
   (3.45256,6.70097)  (-12.0277,-7.92463)   (15.5638,-7.26486)  (-14.9532,-9.98796)  (-3.84551,-4.52037)   (-8.92488,14.6273)   (-11.3693,8.64566)    (-14.6817,9.1834)   (-13.243,-10.9918)  (-1.89669,-6.08764)
   (22.0843,3.44771)   (10.2037,-6.08624)   (0.451633,10.7834)    (4.25173,13.5585)    (6.1873,-9.23388)   (10.4017,-11.7972)   (-5.33181,16.3047)    (2.80823,9.51654)  (-15.2553,-2.96387)   (-1.80178,-17.171)
  (6.65584,-4.20219)  (-22.2693,-2.40002)    (12.5152,8.60855)   (9.82279,-13.5401)  (-17.4628,-3.26002)   (3.23608,-5.22377)   (-6.6794,-13.5688)  (-13.6475,-4.83143)    (15.8706,5.57878)    (5.3668,-1.66928)
 (15.8426,-0.712177)    (9.66653,13.1371)   (13.6225,-10.4194)   (7.43235,-12.2147)    (15.6721,6.83542)   (15.3532,-3.23863)  (-2.73261,-4.98022)   (3.03679,-4.74562)  (-5.30063,0.818373)  (0.0588816,17.0419)
   (18.9452,3.67243)  (-6.96607,-9.55796)   (6.56714,-1.74657)   (-6.39578,13.2946)  (-8.58117,-5.47089)   (4.47595,-10.9374)    (3.73198,16.1942)   (-3.81166,6.63538)    (2.91362,9.25257)     (-3.6649,11.555)
  (-7.38876,8.28783)    (6.68414,11.4678)  (-7.58146,-4.65234)   (-11.9241,9.13819)  (0.233409,-14.5396) (-22.4666,-0.722411)   (-7.25917,9.77754)     (6.40359,6.2519) (-0.521005,-9.46367)  (-3.76615,-4.71643)
 (-10.9649,-1.49071) (-4.94397,-0.898059)   (15.0439,-6.12483)  (-5.51891,-12.6134)    (10.9279,7.80845)   (-11.1334,12.1361)     (1.3841,8.58137)   (0.814455,7.27993)  (-4.54082,-13.1875)   (-3.30465,22.8728)
[       OK ] ComplexFloat.Timing (3540 ms)
[----------] 1 test from ComplexFloat (3540 ms total)

gcc compile command:
g++   -Wno-unused-local-typedefs -Wall -std=c++0x -fPIC -lprofiler -DMKL_ILP64 -m64   CMakeFiles/test_util.dir/main.cpp.o  -o ../../../../bin/test_util  -L/meda_home/toolchain/tool/gperftools/2.7/lib  -L/meda_home/toolchain/library/gtest/1.10.0/lib64  -L/meda_home/toolchain/library/glog/0.4.0/lib64  -L/meda_home/toolchain/library/boost/1.72.0/lib  -L/meda_home/toolchain/library/fftw/3.3.8/lib  -L/meda_home/toolchain/library/opencv/  -L/meda_home/toolchain/library/nlopt/2.6.2/lib64  -L/meda_home/roderick/optics/libs  -L/meda_home/sys/build/noah/release/lib  -L/meda_home/sys/build/amutil/lib  -L/meda_home/toolchain/library/fftw/3.3.8/double/lib  -L/meda_home/toolchain/library/fftw/3.3.8/float/lib  -L/meda_home/sys_tool/opt_tool/mkl/lib/intel64 -rdynamic ../../opt/libamsim_basic.a -lgauge -lpthread -lboost_program_options -lboost_filesystem -lboost_system -lOAGEO -lOAIO -lOADB -lOAutil -lz -lglog -lgtest -lfftw3 -lfftw3f -lnlopt -lopencv_core -lopencv_objdetect -lopencv_imgproc -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl -lboost_program_options -lboost_filesystem -lboost_system -lOAGEO -lOAIO -lOADB -lOAutil -lz -lglog -lgtest -lfftw3 -lfftw3f -lnlopt -lopencv_core -lopencv_objdetect -lopencv_imgproc -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lm -ldl -Wl,-rpath,/meda_home/toolchain/tool/gperftools/2.7/lib:/meda_home/toolchain/library/gtest/1.10.0/lib64:/meda_home/toolchain/library/glog/0.4.0/lib64:/meda_home/toolchain/library/boost/1.72.0/lib:/meda_home/toolchain/library/fftw/3.3.8/lib:/meda_home/toolchain/library/opencv/

CPU information
processor : 31
vendor_id : GenuineIntel
cpu family : 6
model : 85
model name : Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz
stepping : 4
microcode : 0x2006906
cpu MHz : 2999.998
cache size : 25344 KB
physical id : 0
siblings : 36
core id : 13
cpu cores : 18
apicid : 27
initial apicid : 27
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 ida arat
bogomips : 5999.99
clflush size : 64
cache_alignment : 64
address sizes : 46 bits physical, 48 bits virtual
You say you're using matrix-multiplication, but your code represents element-wise multiplication. Is this intended?

And there's no reason why the float implementation would be faster or slower than the double implementation in your tests.
There are some interesting things to say about doubles and floats in high performance computing, but the difference in your tests is more likely random noise.
With a time difference of 783 ms (a factor of about 1.3) taken from a single profiled run, I don't think you can really say it's slower.
The variability I get when running the same code (similar to yours) over and over is about that large anyway.


