public inbox for gentoo-commits@lists.gentoo.org
 help / color / mirror / Atom feed
* [gentoo-commits] proj/auto-numerical-bench:unstable commit in: btl/generic_bench/, btl/actions/, btl/generic_bench/timers/, btl/libs/PBLAS/
@ 2011-07-18  0:41 Andrea Arteaga
  0 siblings, 0 replies; only message in thread
From: Andrea Arteaga @ 2011-07-18  0:41 UTC (permalink / raw)
  To: gentoo-commits

commit:     63a0b1731ec14427c3f7e5dfbb63e5e6724e69f9
Author:     spiros <andyspiros <AT> gmail <DOT> com>
AuthorDate: Mon Jul 18 00:36:22 2011 +0000
Commit:     Andrea Arteaga <andyspiros <AT> gmail <DOT> com>
CommitDate: Mon Jul 18 00:36:22 2011 +0000
URL:        http://git.overlays.gentoo.org/gitweb/?p=proj/auto-numerical-bench.git;a=commit;h=63a0b173

Much work on distributed-memory BTL.

---
 .../action_parallel_matrix_vector_product.hh       |   11 ++-
 btl/generic_bench/bench.hh                         |  112 ++++----------------
 .../timers/distributed_perf_analyzer_node.hh       |   78 ++++++++++++++
 .../timers/distributed_perf_analyzer_root.hh       |   94 ++++++++++++++++
 btl/generic_bench/timers/portable_perf_analyzer.hh |   10 +--
 btl/libs/PBLAS/main.cpp                            |    4 +-
 6 files changed, 204 insertions(+), 105 deletions(-)

diff --git a/btl/actions/action_parallel_matrix_vector_product.hh b/btl/actions/action_parallel_matrix_vector_product.hh
index c166e01..07886a2 100644
--- a/btl/actions/action_parallel_matrix_vector_product.hh
+++ b/btl/actions/action_parallel_matrix_vector_product.hh
@@ -22,6 +22,7 @@
 #include "utilities.h"
 #include "STL_interface.hh"
 #include <string>
+#include <algorithm>
 #include "init/init_function.hh"
 #include "init/init_vector.hh"
 #include "init/init_matrix.hh"
@@ -76,9 +77,13 @@ public :
     // Descinit
     int context = Interface::context();
     int info;
-    descinit_(descA, &GlobalRows, &GlobalCols, &BlockRows, &BlockCols, &iZERO, &iZERO, &context,  &LocalRows, &info);
-    descinit_(descX, &GlobalCols,       &iONE, &BlockRows, &BlockCols, &iZERO, &iZERO, &context, &LocalXRows, &info);
-    descinit_(descY, &GlobalRows,       &iONE, &BlockRows, &BlockCols, &iZERO, &iZERO, &context, &LocalYRows, &info);
+    int LDA, LDX, LDY;
+    LDA = std::max(1, LocalRows);
+    LDX = std::max(1, LocalXRows);
+    LDY = std::max(1, LocalYRows);
+    descinit_(descA, &GlobalRows, &GlobalCols, &BlockRows, &BlockCols, &iZERO, &iZERO, &context, &LDA, &info);
+    descinit_(descX, &GlobalCols,       &iONE, &BlockRows, &BlockCols, &iZERO, &iZERO, &context, &LDX, &info);
+    descinit_(descY, &GlobalRows,       &iONE, &BlockRows, &BlockCols, &iZERO, &iZERO, &context, &LDY, &info);
   }
 
   // invalidate copy ctor

diff --git a/btl/generic_bench/bench.hh b/btl/generic_bench/bench.hh
index d9906a4..2a5ba36 100644
--- a/btl/generic_bench/bench.hh
+++ b/btl/generic_bench/bench.hh
@@ -29,21 +29,19 @@
 #include <vector>
 #include <string>
 #include "timers/portable_perf_analyzer.hh"
+#include "timers/distributed_perf_analyzer_root.hh"
+#include "timers/distributed_perf_analyzer_node.hh"
 // #include "timers/mixed_perf_analyzer.hh"
 // #include "timers/x86_perf_analyzer.hh"
 // #include "timers/STL_perf_analyzer.hh"
 #ifdef HAVE_MKL
 extern "C" void cblas_saxpy(const int, const float, const float*, const int, float *, const int);
 #endif
-using namespace std;
 
 template <template<class> class Perf_Analyzer, class Action>
 BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point, bool silent = false )
 {
-  if (BtlConfig::skipAction(Action::name()))
-    return;
-
-  string filename="bench_"+Action::name()+".dat";
+  std::string filename = "bench_"+Action::name()+".dat";
 
   if (!silent) { INFOS("starting " <<filename); }
 
@@ -55,116 +53,44 @@ BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point, bool silen
   // matrices and vector size calculations
   size_lin_log(nb_point,size_min,size_max,tab_sizes);
 
-  std::vector<int> oldSizes;
-  std::vector<double> oldFlops;
-  bool hasOldResults = read_xy_file(filename, oldSizes, oldFlops, true);
-  int oldi = oldSizes.size() - 1;
-
   // loop on matrix size
   Perf_Analyzer<Action> perf_action;
-  for (int i=nb_point-1;i>=0;i--)
+  for (int i=nb_point-1; i>=0; i--)
   {
-    //INFOS("size=" <<tab_sizes[i]<<"   ("<<nb_point-i<<"/"<<nb_point<<")");
     if (!silent)
       std::cout << " " << "size = " << tab_sizes[i] << "  " << std::flush;
 
     BTL_DISABLE_SSE_EXCEPTIONS();
-    #ifdef HAVE_MKL
-    {
-      float dummy;
-      cblas_saxpy(1,0,&dummy,1,&dummy,1);
-    }
-    #endif
 
     tab_mflops[i] = perf_action.eval_mflops(tab_sizes[i], silent);
-    if (!silent) std::cout << tab_mflops[i];
-    
-    if (hasOldResults)
-    {
-      while (oldi>=0 && oldSizes[oldi]>tab_sizes[i])
-        --oldi;
-      if (oldi>=0 && oldSizes[oldi]==tab_sizes[i] && !silent)
-      {
-        if (oldFlops[oldi]<tab_mflops[i])
-          std::cout << "\t > ";
-        else
-          std::cout << "\t < ";
-        std::cout << oldFlops[oldi];
-      }
-      --oldi;
-    }
     if (!silent)
-    std::cout << " MFlops    (" << nb_point-i << "/" << nb_point << ")" << std::endl;
-  }
-
-  if (!BtlConfig::Instance.overwriteResults)
-  {
-    if (hasOldResults)
-    {
-      // merge the two data
-      std::vector<int> newSizes;
-      std::vector<double> newFlops;
-      int i=0;
-      int j=0;
-      while (i<tab_sizes.size() && j<oldSizes.size())
-      {
-        if (tab_sizes[i] == oldSizes[j])
-        {
-          newSizes.push_back(tab_sizes[i]);
-          newFlops.push_back(std::max(tab_mflops[i], oldFlops[j]));
-          ++i;
-          ++j;
-        }
-        else if (tab_sizes[i] < oldSizes[j])
-        {
-          newSizes.push_back(tab_sizes[i]);
-          newFlops.push_back(tab_mflops[i]);
-          ++i;
-        }
-        else
-        {
-          newSizes.push_back(oldSizes[j]);
-          newFlops.push_back(oldFlops[j]);
-          ++j;
-        }
-      }
-      while (i<tab_sizes.size())
-      {
-        newSizes.push_back(tab_sizes[i]);
-        newFlops.push_back(tab_mflops[i]);
-        ++i;
-      }
-      while (j<oldSizes.size())
-      {
-        newSizes.push_back(oldSizes[j]);
-        newFlops.push_back(oldFlops[j]);
-        ++j;
-      }
-      tab_mflops = newFlops;
-      tab_sizes = newSizes;
-    }
+      std::cout << tab_mflops[i] << " MFlops    (" << nb_point-i << "/" << nb_point << ")" << std::endl;
   }
 
   // dump the result in a file  :
-  if (!silent) dump_xy_file(tab_sizes,tab_mflops,filename);
+  if (!silent) dump_xy_file(tab_sizes, tab_mflops, filename);
 
 }
 
 // default Perf Analyzer
 
 template <class Action>
-BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point, bool silent ){
-
-  // if the rdtsc is not available :
+BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point, bool silent = false)
+{
   bench<Portable_Perf_Analyzer,Action>(size_min,size_max,nb_point,silent);
-  // if the rdtsc is available :
-//    bench<Mixed_Perf_Analyzer,Action>(size_min,size_max,nb_point);
-
+}
 
-  // Only for small problem size. Otherwize it will be too long
-//   bench<X86_Perf_Analyzer,Action>(size_min,size_max,nb_point);
-//   bench<STL_Perf_Analyzer,Action>(size_min,size_max,nb_point);
+// distributed Perf Analyzer
 
+template <class Action>
+BTL_DONT_INLINE void distr_bench( int size_min, int size_max, int nb_point, bool silent = false)
+{
+  int myid, nproc;
+  blacs_pinfo_(&myid, &nproc);
+  if (myid)
+    bench<Distributed_Perf_Analyzer_Node, Action>(size_min, size_max, nb_point, silent);
+  else
+    bench<Distributed_Perf_Analyzer_Root, Action>(size_min, size_max, nb_point, silent);
 }
 
 #endif

diff --git a/btl/generic_bench/timers/distributed_perf_analyzer_node.hh b/btl/generic_bench/timers/distributed_perf_analyzer_node.hh
new file mode 100644
index 0000000..7399d30
--- /dev/null
+++ b/btl/generic_bench/timers/distributed_perf_analyzer_node.hh
@@ -0,0 +1,78 @@
+#ifndef _PORTABLE_PERF_ANALYZER_NODE_HH
+#define _PORTABLE_PERF_ANALYZER_NODE_HH
+
+#include "utilities.h"
+#include "timers/portable_timer.hh"
+#include "blacs.h"
+
+template <class Action>
+class Distributed_Perf_Analyzer_Node{
+public:
+  Distributed_Perf_Analyzer_Node( ):_nb_calc(0){
+    MESSAGE("Distributed_Perf_Analyzer_Node Ctor");
+    int temp, what = 0;
+    blacs_get_(&temp, &what, &context);
+  };
+  Distributed_Perf_Analyzer_Node( const Distributed_Perf_Analyzer_Node& ){
+    INFOS("Copy Ctor not implemented");
+    exit(0);
+  };
+  ~Distributed_Perf_Analyzer_Node(){
+    MESSAGE("Distributed_Perf_Analyzer_Node Dtor");
+  };
+
+  BTL_DONT_INLINE double eval_mflops(int size, bool silent = false)
+  {
+    Action action(size);
+
+    /* Find best _nb_calc_ */
+    int bcast_receive, iZERO = 0, iONE = 1;
+    igebr2d_(&context, "A", " ", &iONE, &iONE, &bcast_receive, &iONE, &iZERO, &iZERO);
+    while (bcast_receive > 0) {
+      _nb_calc = bcast_receive;
+      action.initialize();
+      time_calculate(action);
+      igebr2d_(&context, "A", " ", &iONE, &iONE, &bcast_receive, &iONE, &iZERO, &iZERO);
+    }
+    int tries = -bcast_receive;
+
+    /* Optimize */
+    for (int i = 1; i < tries; ++i) {
+      Action _action(size);
+      _action.initialize();
+      time_calculate(_action);
+    }
+
+    /* Check */
+    int do_check;
+    igebr2d_(&context, "A", " ", &iONE, &iONE, &do_check, &iONE, &iZERO, &iZERO);
+    if (do_check > 0) {
+      action.initialize();
+      action.calculate();
+      action.check_result();
+    }
+
+    /* Return a void value */
+    return 0.;
+  }
+
+  BTL_DONT_INLINE void time_calculate(Action & action)
+  {
+    // no need for time measurement
+    action.calculate();
+    for (int i = 0; i < _nb_calc; ++i)
+      action.calculate();
+  }
+
+  unsigned long long get_nb_calc()
+  {
+    return _nb_calc;
+  }
+
+
+private:
+  int context;
+  unsigned long long _nb_calc;
+};
+
+#endif //_PORTABLE_PERF_ANALYZER_NODE_HH

diff --git a/btl/generic_bench/timers/distributed_perf_analyzer_root.hh b/btl/generic_bench/timers/distributed_perf_analyzer_root.hh
new file mode 100644
index 0000000..ca59738
--- /dev/null
+++ b/btl/generic_bench/timers/distributed_perf_analyzer_root.hh
@@ -0,0 +1,94 @@
+#ifndef _PORTABLE_PERF_ANALYZER_ROOT_HH
+#define _PORTABLE_PERF_ANALYZER_ROOT_HH
+
+#include "utilities.h"
+#include "timers/portable_timer.hh"
+#include "blacs.h"
+
+template <class Action>
+class Distributed_Perf_Analyzer_Root{
+public:
+  Distributed_Perf_Analyzer_Root( ):_nb_calc(0), m_time_action(0), _chronos(){
+    MESSAGE("Distributed_Perf_Analyzer_Root Ctor");
+    int temp, what = 0;
+    blacs_get_(&temp, &what, &context);
+  };
+  Distributed_Perf_Analyzer_Root( const Distributed_Perf_Analyzer_Root & ){
+    INFOS("Copy Ctor not implemented");
+    exit(0);
+  };
+  ~Distributed_Perf_Analyzer_Root(){
+    MESSAGE("Distributed_Perf_Analyzer_Root Dtor");
+  };
+
+  BTL_DONT_INLINE double eval_mflops(int size, bool silent = false)
+  {
+    Action action(size);
+    m_time_action = 0;
+    _nb_calc = 0;
+
+    /* Find best _nb_calc_ */
+    int bcast_send = _nb_calc;
+    int iONE = 1;
+    while (m_time_action < MIN_TIME) {
+      _nb_calc = _nb_calc ? 2*_nb_calc : 1;
+      bcast_send = _nb_calc;
+      igebs2d_(&context, "A", " ", &iONE, &iONE, &bcast_send, &iONE);
+      action.initialize();
+      m_time_action = time_calculate(action);
+    }
+    int tries = BtlConfig::Instance.tries;
+    bcast_send = -tries;
+    igebs2d_(&context, "A", " ", &iONE, &iONE, &bcast_send, &iONE);
+
+    /* Optimize */
+    for (int i = 1; i < tries; ++i) {
+      Action _action(size);
+      if (!silent)
+        std::cout << " " << _action.nb_op_base()*_nb_calc/(m_time_action*1e6) << " ";
+      _action.initialize();
+      m_time_action = std::min(m_time_action, time_calculate(_action));
+    }
+    double time_action = m_time_action / (double(_nb_calc));
+
+    /* Check */
+    int do_check = (BtlConfig::Instance.checkResults && size<128) ? 1 : 0;
+    igebs2d_(&context, "A", " ", &iONE, &iONE, &do_check, &iONE);
+    if (do_check > 0) {
+      action.initialize();
+      action.calculate();
+      action.check_result();
+    }
+
+    return action.nb_op_base()/(time_action*1e6);
+  }
+
+  BTL_DONT_INLINE double time_calculate(Action & action)
+  {
+    // time measurement
+    action.calculate();
+    _chronos.start();
+    for (int ii=0; ii<_nb_calc; ii++)
+    {
+      action.calculate();
+    }
+    _chronos.stop();
+    return _chronos.user_time();
+  }
+
+  unsigned long long get_nb_calc()
+  {
+    return _nb_calc;
+  }
+
+
+private:
+  int context;
+  unsigned long long _nb_calc;
+  double m_time_action;
+  Portable_Timer _chronos;
+
+};
+
+#endif //_PORTABLE_PERF_ANALYZER_ROOT_HH
+

diff --git a/btl/generic_bench/timers/portable_perf_analyzer.hh b/btl/generic_bench/timers/portable_perf_analyzer.hh
index 161992f..a8c261f 100644
--- a/btl/generic_bench/timers/portable_perf_analyzer.hh
+++ b/btl/generic_bench/timers/portable_perf_analyzer.hh
@@ -42,12 +42,8 @@ public:
   {
     Action action(size);
 
-//     action.initialize();
-//     time_action = time_calculate(action);
-    while (m_time_action < MIN_TIME)
-    {
-      if(_nb_calc==0) _nb_calc = 1;
-      else            _nb_calc *= 2;
+    while (m_time_action < MIN_TIME) {
+      _nb_calc = _nb_calc ? 2*_nb_calc : 1;
       action.initialize();
       m_time_action = time_calculate(action);
     }
@@ -79,7 +75,7 @@ public:
     // time measurement
     action.calculate();
     _chronos.start();
-    for (int ii=0;ii<_nb_calc;ii++)
+    for (int ii=0; ii<_nb_calc; ii++)
     {
       action.calculate();
     }

diff --git a/btl/libs/PBLAS/main.cpp b/btl/libs/PBLAS/main.cpp
index 888f123..bca245c 100644
--- a/btl/libs/PBLAS/main.cpp
+++ b/btl/libs/PBLAS/main.cpp
@@ -3,7 +3,7 @@
 #include "bench.hh"
 
 #include <iostream>
-using namespace std;
+//using namespace std;
 
 #include "pblas_interface.hh"
 #include "action_parallel_matrix_vector_product.hh"
@@ -22,7 +22,7 @@ int main(int argc, char **argv)
   blacs_gridinit_(&context, "Row-major", &procrows, &proccols);
   bool iamroot = (myid == 0);
 
-  bench<Action_parallel_matrix_vector_product<pblas_interface<REAL_TYPE> > >(MIN_MV,MAX_MV,NB_POINT,!iamroot);
+  distr_bench<Action_parallel_matrix_vector_product<pblas_interface<REAL_TYPE> > >(10,MAX_MV,NB_POINT,!iamroot);
 
 //  blacs_exit_(&iZERO);
   MPI_Finalize();



^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads:[~2011-07-18  0:41 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-07-18  0:41 [gentoo-commits] proj/auto-numerical-bench:unstable commit in: btl/generic_bench/, btl/actions/, btl/generic_bench/timers/, btl/libs/PBLAS/ Andrea Arteaga

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox