This forum has been archived. All content is frozen. Please use KDE Discuss instead.

Matrix<float> slow than Matrix<double>

Tags: None
(comma "," separated)
kde-roderick
Registered Member
Posts
2
Karma
0
I'm new to Eigen and wrote two test cases to compare double/float matrix multiplication time. I found that the float version is slower than the double version.
The calculation is:
Code: Select all
 m3(i, j) = m1(i, j) * m2(i, j)

If the calculation replaced with:
Code: Select all
m3(i, j) = m1(i, j) * factor
, the float version takes about half the time of the double version.

My question is: why is the float version slower than the double version, and how can I resolve it?

Code: Select all
#include <iostream>
#include <gtest/gtest.h>
#include <glog/logging.h>
#include <vector>
#include <map>

#ifndef BOOST_TEST
#define BOOST_DATE_TIME_SOURCE
#include <boost/date_time.hpp>
#endif


// Bring the standard-library and Eigen names used by the tests below into scope.
// NOTE(review): no Eigen header is included in this snippet — presumably
// <Eigen/Dense> is needed as well; confirm against the real source file.
using namespace std;
using namespace Eigen;
TEST(MatrixCompare, DoubleSpeed) {
    const double factor = 2.3;
   Matrix<double, Dynamic, Dynamic, RowMajor> m1(1600, 2000);
   Matrix<double, Dynamic, Dynamic, RowMajor> m2(1600, 2000);
    Matrix<double, Dynamic, Dynamic, RowMajor> m3(1600, 2000);

   for (int i = 0; i < m1.rows(); i++) {
      for (int j = 0; j < m1.cols(); j++) {
         m1(i, j) = (1 + j) * (i + 1);
            m2(i, j) = (i + 2) * (j + 2);
      }
   }
   
   namespace bpt = boost::posix_time;
   const bpt::ptime tm_begin1 = bpt::microsec_clock::local_time();
   
   for (int i = 0; i < m1.rows(); i++) {
      for (int j = 0; j < m1.cols(); j++) {
         m3(i, j) = m1(i, j) * m2(i, j);
      }
   }

   const bpt::ptime  tm_end1 = boost::posix_time::microsec_clock::local_time();
   const bpt::time_duration dur1 = tm_end1 - tm_begin1;
   cout << "DoubleSpeed time: " << dur1.total_microseconds() << " us" << endl;

   m3 = m1 * factor;
   cout << m3.block(0, 0, 5, 5) << endl;
}

TEST(MatrixCompare, FloatSpeed) {
    const float factor = 2.3;
   Matrix<float, Dynamic, Dynamic, RowMajor> m1(1600, 2000);
   Matrix<float, Dynamic, Dynamic, RowMajor> m2(1600, 2000);
    Matrix<float, Dynamic, Dynamic, RowMajor> m3(1600, 2000);

   for (int i = 0; i < m1.rows(); i++) {
      for (int j = 0; j < m1.cols(); j++) {
         m1(i, j) = (1 + j) * (i + 1);
            m2(i, j) = (i + 2) * (j + 2);
      }
   }
   
   namespace bpt = boost::posix_time;
   const bpt::ptime tm_begin1 = bpt::microsec_clock::local_time();
   
   for (int i = 0; i < m1.rows(); i++) {
      for (int j = 0; j < m1.cols(); j++) {
         m3(i, j) = m1(i, j) * m2(i, j);
      }
   }

   const bpt::ptime  tm_end1 = boost::posix_time::microsec_clock::local_time();
   const bpt::time_duration dur1 = tm_end1 - tm_begin1;
   cout << "FloatSpeed time: " << dur1.total_microseconds() << " us" << endl;

   m3 = m1 * factor;
   cout << m3.block(0, 0, 5, 5) << endl;
};


The link command looks like this (there seems to be no optimization flag):
Code: Select all
g++   -Wno-unused-local-typedefs -Wall -std=c++0x -fPIC   CMakeFiles/test_measurer.dir/main.cpp.o  -o ../../../../bin/test_measurer  -L/meda_home/toolchain/tool/gperftools/2.7/lib  -L/meda_home/toolchain/library/gtest/1.10.0/lib64  -L/meda_home/toolchain/library/glog/0.4.0/lib64  -L/meda_home/toolchain/library/boost/1.72.0/lib  -L/meda_home/toolchain/library/fftw/3.3.8/lib  -L/meda_home/toolchain/library/opencv/2.4.13.6/lib  -L/meda_home/toolchain/library/nlopt/2.6.2/lib64  -L/meda_home/roderick/optics/libs  -L/meda_home/sys/build/noah/release/lib  -L/meda_home/sys/build/amutil/lib  -L/meda_home/toolchain/library/fftw/3.3.8/double/lib  -L/meda_home/toolchain/library/fftw/3.3.8/float/lib  -L/meda_home/sys_tool/opt_tool/mkl/lib/intel64 -rdynamic ../../opt/libamsim_basic.a -lgauge -lpthread -lboost_program_options -lboost_filesystem -lboost_system -lOAGEO -lOAIO -lOADB -lOAutil -lz -lglog -lgtest -lfftw3 -lfftw3f -lnlopt -lopencv_core -lopencv_objdetect -lopencv_imgproc -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl -lboost_program_options -lboost_filesystem -lboost_system -lOAGEO -lOAIO -lOADB -lOAutil -lz -lglog -lgtest -lfftw3 -lfftw3f -lnlopt -lopencv_core -lopencv_objdetect -lopencv_imgproc -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lm -ldl -Wl,-rpath,/meda_home/toolchain/tool/gperftools/2.7/lib:/meda_home/toolchain/library/gtest/1.10.0/lib64:/meda_home/toolchain/library/glog/0.4.0/lib64:/meda_home/toolchain/library/boost/1.72.0/lib:/meda_home/toolchain/library/fftw/3.3.8/lib:/meda_home/toolchain/library/opencv/2.4.13.6/lib:/meda_home/toolchain/library/nlopt/2.6.2/lib64:/meda_home/roderick/optics/libs:/meda_home/sys/build/noah/release/lib:/meda_home/sys/build/amutil/lib:/meda_home/toolchain/library/fftw/3.3.8/double/lib:/meda_home/toolchain/library/fftw/3.3.8/float/lib:/meda_home/sys_tool/opt_tool/mkl/lib/intel6

Last edited by kde-roderick on Thu Oct 29, 2020 3:04 am, edited 1 time in total.
kde-roderick
Registered Member
Posts
2
Karma
0
In another case, the float version is slower than the double version too.
Code: Select all
matrix_of_complex3(i, j) = matrix_of_complex1(i, j) * matrix_of_complex2(i, j)


Code: Select all
// Dimensions of the matrices used by the complex-number timing tests.
constexpr int g_row_num{4500};  // number of rows
constexpr int g_col_num{3200};  // number of columns

// Scales every entry of a random complex<double> matrix by a complex and a
// real constant, one coefficient at a time, then prints the top-left 10x10
// corner of the result.
//
// No explicit timer is used here; any duration reported for this test comes
// from the test runner and therefore also covers setup and printing.
TEST(ComplexDouble, Timing) {
    typedef Matrix<complex<double>, Dynamic, Dynamic, RowMajor> ComplexMatrix;

    double d1 = 3.14;
    complex<double> c1(3.4, 4.3);

    // Materialize the random input into a real matrix. Binding the result of
    // Random() with `auto` keeps a lazy expression whose coefficients are
    // re-generated on every m1(i, j) access, which would put random-number
    // generation inside the loop below.
    const ComplexMatrix m1 = ComplexMatrix::Random(g_row_num, g_col_num);
    ComplexMatrix m2(g_row_num, g_col_num);

    for (int i = 0; i < m1.rows(); i++) {
        for (int j = 0; j < m1.cols(); j++) {
            m2(i, j) = m1(i, j) * c1 * d1;
        }
    }

    cout << m2.block(0, 0, 10, 10) << endl;
}

// Scales every entry of a random complex<float> matrix by a complex and a
// real constant, one coefficient at a time, then prints the top-left 10x10
// corner of the result.
//
// No explicit timer is used here; any duration reported for this test comes
// from the test runner and therefore also covers setup and printing.
TEST(ComplexFloat, Timing) {
    typedef Matrix<complex<float>, Dynamic, Dynamic, RowMajor> ComplexMatrix;

    float d1 = 3.14;
    complex<float> c1(3.4, 4.3);

    // Materialize the random input into a real matrix. Binding the result of
    // Random() with `auto` keeps a lazy expression whose coefficients are
    // re-generated on every m1(i, j) access, which would put random-number
    // generation inside the loop below.
    const ComplexMatrix m1 = ComplexMatrix::Random(g_row_num, g_col_num);
    ComplexMatrix m2(g_row_num, g_col_num);

    for (int i = 0; i < m1.rows(); i++) {
        for (int j = 0; j < m1.cols(); j++) {
            m2(i, j) = m1(i, j) * c1 * d1;
        }
    }

    cout << m2.block(0, 0, 10, 10) << endl;
}



Test Result:
Code: Select all
[----------] 1 test from ComplexDouble
[ RUN      ] ComplexDouble.Timing
  (11.9622,-4.76391)    (7.57194,-17.271)    (16.291,0.468158)     (7.156,-10.0055)   (1.64442,-18.4684)   (-19.4887,3.04485)     (6.8915,9.43352)  (-7.39686,-3.10288)   (-3.19588,3.59287)     (10.268,12.6298)
   (16.6181,4.57214)   (-9.22271,5.75099)   (6.44319,-1.28841)   (-4.33261,12.1227)   (-1.8963,-5.30527)   (14.1994,-9.57226)   (16.5955,-1.69768)  (-7.18996,0.610687)   (13.6211,-9.60615)    (5.61387,14.0553)
  (11.9204,-7.39493)  (-1.54411,-5.46907)   (-6.65892,-16.563)   (6.04319,-7.77907)   (1.92263,-9.32407)  (1.45918,-0.386767)    (6.26973,-15.928)   (18.8894,-1.34474)  (-5.40889,-7.94799)   (6.57231,-3.12474)
    (6.97008,10.865)    (2.36426,19.2347)   (4.52054,-4.40613)      (1.356,16.9818)  (-11.6305,0.188956)  (18.4705,-0.715135)    (17.3658,6.03462)   (4.80237,-15.0537)    (4.29732,2.21504)   (8.84842,-7.49431)
  (-13.1855,3.04028)   (-8.47538,3.31153)  (-21.3886,-1.45713)    (5.62472,3.28542)    (14.8775,-6.9832)  (-6.68966,-16.5691)   (9.77552,-4.40095)    (-10.2316,8.0795)  (-19.3376,-4.61249)   (-7.87485,9.81641)
  (-6.27428,1.45117)   (-6.85766,16.9591)   (-5.56633,8.77744)  (-11.0054,-11.3161)    (11.5139,11.4778)   (3.44057,-1.83339)    (-2.21169,13.547) (-19.5544,-0.383666)   (6.37967,-18.5878)   (-7.60339,13.3739)
 (-4.85457,-14.4045)     (10.5154,7.3036)  (-4.60871,-12.6508)   (0.596507,6.24256)   (0.982165,14.1581)     (2.91103,8.7621)   (4.24564,-17.5644)   (10.3552,-3.54941)   (-8.57908,8.11857)    (2.3476,-5.43621)
  (7.09012,-4.06822) (-1.27914,-0.471844)   (-4.6466,-15.8335)    (15.4649,5.15193)   (-5.4594,-8.66568)   (1.28489,-10.0818)    (-1.6819,16.4548)    (2.46613,2.62347)   (-9.30432,6.08379)   (3.03018,-17.4072)
  (-1.44176,21.6909)   (-14.2472,4.00012)   (14.4144,-6.31735)  (0.814833,-18.1089)   (-6.09504,18.2304)   (0.592451,8.40779)  (-11.1253,-13.1294)  (-18.6959,-1.92467)   (5.56531,-6.52042)   (-3.51627,12.0503)
  (-6.52932,-3.7738)    (2.09978,6.23612)   (5.90467,-10.5818)   (5.24844,-10.9746)    (-11.052,-12.669)    (1.21105,20.9772)    (2.20667,7.22686)    (21.6756,3.06041)   (-8.50669,1.68755)  (-2.46858,-1.46122)
[       OK ] ComplexDouble.Timing (2757 ms)
[----------] 1 test from ComplexDouble (2757 ms total)

[----------] 1 test from ComplexFloat
[ RUN      ] ComplexFloat.Timing
  (9.41872,-14.0621)    (4.53709,2.52084)    (-2.6496,15.6391)   (-5.41675,15.8699)  (0.964372,-19.3973)    (12.0572,2.70792)  (-9.98501,-6.93664)    (-11.325,6.51012)    (20.2115,4.01326)    (17.3532,4.42964)
  (0.838134,-14.216)   (1.79402,-23.3493)    (19.8017,1.19708)  (-1.72511,-8.55702)   (12.3892,-12.0709)   (2.92304,-12.4514)   (-5.48972,8.32045)   (9.17595,-12.4055)   (-15.874,-8.17748)   (-4.20945,20.6047)
 (-15.1947,-9.38803)   (4.45996,-10.3651)    (-9.1915,5.23389)   (17.1325,-4.05876)   (-9.26078,15.7354)     (15.2728,4.9197)  (-0.850976,4.11447)    (5.76246,15.0008)    (-7.47418,8.1391)    (19.9157,4.62661)
   (3.45256,6.70097)  (-12.0277,-7.92463)   (15.5638,-7.26486)  (-14.9532,-9.98796)  (-3.84551,-4.52037)   (-8.92488,14.6273)   (-11.3693,8.64566)    (-14.6817,9.1834)   (-13.243,-10.9918)  (-1.89669,-6.08764)
   (22.0843,3.44771)   (10.2037,-6.08624)   (0.451633,10.7834)    (4.25173,13.5585)    (6.1873,-9.23388)   (10.4017,-11.7972)   (-5.33181,16.3047)    (2.80823,9.51654)  (-15.2553,-2.96387)   (-1.80178,-17.171)
  (6.65584,-4.20219)  (-22.2693,-2.40002)    (12.5152,8.60855)   (9.82279,-13.5401)  (-17.4628,-3.26002)   (3.23608,-5.22377)   (-6.6794,-13.5688)  (-13.6475,-4.83143)    (15.8706,5.57878)    (5.3668,-1.66928)
 (15.8426,-0.712177)    (9.66653,13.1371)   (13.6225,-10.4194)   (7.43235,-12.2147)    (15.6721,6.83542)   (15.3532,-3.23863)  (-2.73261,-4.98022)   (3.03679,-4.74562)  (-5.30063,0.818373)  (0.0588816,17.0419)
   (18.9452,3.67243)  (-6.96607,-9.55796)   (6.56714,-1.74657)   (-6.39578,13.2946)  (-8.58117,-5.47089)   (4.47595,-10.9374)    (3.73198,16.1942)   (-3.81166,6.63538)    (2.91362,9.25257)     (-3.6649,11.555)
  (-7.38876,8.28783)    (6.68414,11.4678)  (-7.58146,-4.65234)   (-11.9241,9.13819)  (0.233409,-14.5396) (-22.4666,-0.722411)   (-7.25917,9.77754)     (6.40359,6.2519) (-0.521005,-9.46367)  (-3.76615,-4.71643)
 (-10.9649,-1.49071) (-4.94397,-0.898059)   (15.0439,-6.12483)  (-5.51891,-12.6134)    (10.9279,7.80845)   (-11.1334,12.1361)     (1.3841,8.58137)   (0.814455,7.27993)  (-4.54082,-13.1875)   (-3.30465,22.8728)
[       OK ] ComplexFloat.Timing (3540 ms)
[----------] 1 test from ComplexFloat (3540 ms total)



gcc compile command:
Code: Select all
g++   -Wno-unused-local-typedefs -Wall -std=c++0x -fPIC -lprofiler -DMKL_ILP64 -m64   CMakeFiles/test_util.dir/main.cpp.o  -o ../../../../bin/test_util  -L/meda_home/toolchain/tool/gperftools/2.7/lib  -L/meda_home/toolchain/library/gtest/1.10.0/lib64  -L/meda_home/toolchain/library/glog/0.4.0/lib64  -L/meda_home/toolchain/library/boost/1.72.0/lib  -L/meda_home/toolchain/library/fftw/3.3.8/lib  -L/meda_home/toolchain/library/opencv/2.4.13.6/lib  -L/meda_home/toolchain/library/nlopt/2.6.2/lib64  -L/meda_home/roderick/optics/libs  -L/meda_home/sys/build/noah/release/lib  -L/meda_home/sys/build/amutil/lib  -L/meda_home/toolchain/library/fftw/3.3.8/double/lib  -L/meda_home/toolchain/library/fftw/3.3.8/float/lib  -L/meda_home/sys_tool/opt_tool/mkl/lib/intel64 -rdynamic ../../opt/libamsim_basic.a -lgauge -lpthread -lboost_program_options -lboost_filesystem -lboost_system -lOAGEO -lOAIO -lOADB -lOAutil -lz -lglog -lgtest -lfftw3 -lfftw3f -lnlopt -lopencv_core -lopencv_objdetect -lopencv_imgproc -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lpthread -lm -ldl -lboost_program_options -lboost_filesystem -lboost_system -lOAGEO -lOAIO -lOADB -lOAutil -lz -lglog -lgtest -lfftw3 -lfftw3f -lnlopt -lopencv_core -lopencv_objdetect -lopencv_imgproc -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core -lm -ldl -Wl,-rpath,/meda_home/toolchain/tool/gperftools/2.7/lib:/meda_home/toolchain/library/gtest/1.10.0/lib64:/meda_home/toolchain/library/glog/0.4.0/lib64:/meda_home/toolchain/library/boost/1.72.0/lib:/meda_home/toolchain/library/fftw/3.3.8/lib:/meda_home/toolchain/library/opencv/2.4.13.6/lib:/meda_home/toolchain/library/nlopt/2.6.2/lib64:/meda_home/roderick/optics/libs:/meda_home/sys/build/noah/release/lib:/meda_home/sys/build/amutil/lib:/meda_home/toolchain/library/fftw/3.3.8/double/lib:/meda_home/toolchain/library/fftw/3.3.8/float/lib:/meda_home/sys_tool/opt_tool/mkl/lib/intel64



CPU information
processor : 31
vendor_id : GenuineIntel
cpu family : 6
model : 85
model name : Intel(R) Xeon(R) Platinum 8124M CPU @ 3.00GHz
stepping : 4
microcode : 0x2006906
cpu MHz : 2999.998
cache size : 25344 KB
physical id : 0
siblings : 36
core id : 13
cpu cores : 18
apicid : 27
initial apicid : 27
fpu : yes
fpu_exception : yes
cpuid level : 13
wp : yes
flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl xtopology nonstop_tsc aperfmperf eagerfpu pni pclmulqdq monitor ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm mpx avx512f avx512dq rdseed adx smap clflushopt clwb avx512cd avx512bw avx512vl xsaveopt xsavec xgetbv1 ida arat
bogomips : 5999.99
clflush size : 64
cache_alignment : 64
address sizes : 46 bits physical, 48 bits virtual
andrew-dy
Registered Member
Posts
15
Karma
0
You say you're using matrix-multiplication, but your code represents element-wise multiplication. Is this intended?

And there's no reason why the float implementation would be faster or slower than the double implementation in your tests.
There are some interesting things to say about doubles and floats in high performance computing, but the difference in your tests is more likely random noise.
With a time difference of only 783 ms (a factor of about 1.3) from a single profiled run, I don't think you can really say it's slower.
The variability I get between running the same code (similar to yours) over and over is about that large anyways.


Bookmarks



Who is online

Registered users: bartoloni, Bing [Bot], Google [Bot], Yahoo [Bot]