From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from pigeon.gentoo.org ([208.92.234.80] helo=lists.gentoo.org) by finch.gentoo.org with esmtp (Exim 4.60) (envelope-from ) id 1Qibtd-0002ig-RP for garchives@archives.gentoo.org; Mon, 18 Jul 2011 00:41:18 +0000 Received: from pigeon.gentoo.org (localhost [127.0.0.1]) by pigeon.gentoo.org (Postfix) with SMTP id 9176821C053; Mon, 18 Jul 2011 00:41:07 +0000 (UTC) Received: from smtp.gentoo.org (smtp.gentoo.org [140.211.166.183]) by pigeon.gentoo.org (Postfix) with ESMTP id 3387C21C053 for ; Mon, 18 Jul 2011 00:41:06 +0000 (UTC) Received: from pelican.gentoo.org (unknown [66.219.59.40]) (using TLSv1 with cipher ADH-AES256-SHA (256/256 bits)) (No client certificate requested) by smtp.gentoo.org (Postfix) with ESMTPS id 676401B400B for ; Mon, 18 Jul 2011 00:41:06 +0000 (UTC) Received: from localhost.localdomain (localhost [127.0.0.1]) by pelican.gentoo.org (Postfix) with ESMTP id B806E8003D for ; Mon, 18 Jul 2011 00:41:05 +0000 (UTC) From: "Andrea Arteaga" To: gentoo-commits@lists.gentoo.org Content-type: text/plain; charset=UTF-8 Reply-To: gentoo-dev@lists.gentoo.org, "Andrea Arteaga" Message-ID: <63a0b1731ec14427c3f7e5dfbb63e5e6724e69f9.spiros@gentoo> Subject: [gentoo-commits] proj/auto-numerical-bench:unstable commit in: btl/generic_bench/, btl/actions/, btl/generic_bench/timers/, btl/libs/PBLAS/ X-VCS-Repository: proj/auto-numerical-bench X-VCS-Files: btl/actions/action_parallel_matrix_vector_product.hh btl/generic_bench/bench.hh btl/generic_bench/timers/distributed_perf_analyzer_node.hh btl/generic_bench/timers/distributed_perf_analyzer_root.hh btl/generic_bench/timers/portable_perf_analyzer.hh btl/libs/PBLAS/main.cpp X-VCS-Directories: btl/generic_bench/ btl/actions/ btl/generic_bench/timers/ btl/libs/PBLAS/ X-VCS-Committer: spiros X-VCS-Committer-Name: Andrea Arteaga X-VCS-Revision: 63a0b1731ec14427c3f7e5dfbb63e5e6724e69f9 Date: Mon, 18 Jul 2011 00:41:05 +0000 (UTC) Precedence: bulk List-Post: List-Help: List-Unsubscribe: List-Subscribe: List-Id: Gentoo Linux mail X-BeenThere: gentoo-commits@lists.gentoo.org Content-Transfer-Encoding: quoted-printable X-Archives-Salt: X-Archives-Hash: 876831ab1e050d7e56d9b56a24cb8dd2 commit: 63a0b1731ec14427c3f7e5dfbb63e5e6724e69f9 Author: spiros gmail com> AuthorDate: Mon Jul 18 00:36:22 2011 +0000 Commit: Andrea Arteaga gmail com> CommitDate: Mon Jul 18 00:36:22 2011 +0000 URL: http://git.overlays.gentoo.org/gitweb/?p=3Dproj/auto-numerica= l-bench.git;a=3Dcommit;h=3D63a0b173 Much work on distributed-memory BTL. --- .../action_parallel_matrix_vector_product.hh | 11 ++- btl/generic_bench/bench.hh | 112 ++++----------= ------ .../timers/distributed_perf_analyzer_node.hh | 78 ++++++++++++++ .../timers/distributed_perf_analyzer_root.hh | 94 ++++++++++++++= ++ btl/generic_bench/timers/portable_perf_analyzer.hh | 10 +-- btl/libs/PBLAS/main.cpp | 4 +- 6 files changed, 204 insertions(+), 105 deletions(-) diff --git a/btl/actions/action_parallel_matrix_vector_product.hh b/btl/a= ctions/action_parallel_matrix_vector_product.hh index c166e01..07886a2 100644 --- a/btl/actions/action_parallel_matrix_vector_product.hh +++ b/btl/actions/action_parallel_matrix_vector_product.hh @@ -22,6 +22,7 @@ #include "utilities.h" #include "STL_interface.hh" #include +#include #include "init/init_function.hh" #include "init/init_vector.hh" #include "init/init_matrix.hh" @@ -76,9 +77,13 @@ public : // Descinit int context =3D Interface::context(); int info; - descinit_(descA, &GlobalRows, &GlobalCols, &BlockRows, &BlockCols, &= iZERO, &iZERO, &context, &LocalRows, &info); - descinit_(descX, &GlobalCols, &iONE, &BlockRows, &BlockCols, &= iZERO, &iZERO, &context, &LocalXRows, &info); - descinit_(descY, &GlobalRows, &iONE, &BlockRows, &BlockCols, &= iZERO, &iZERO, &context, &LocalYRows, &info); + int LDA, LDX, LDY; + LDA =3D std::max(1, LocalRows); + LDX =3D std::max(1, LocalXRows); + LDY =3D std::max(1, LocalYRows); + descinit_(descA, &GlobalRows, &GlobalCols, &BlockRows, &BlockCols, &= iZERO, &iZERO, &context, &LDA, &info); + descinit_(descX, &GlobalCols, &iONE, &BlockRows, &BlockCols, &= iZERO, &iZERO, &context, &LDX, &info); + descinit_(descY, &GlobalRows, &iONE, &BlockRows, &BlockCols, &= iZERO, &iZERO, &context, &LDY, &info); } =20 // invalidate copy ctor diff --git a/btl/generic_bench/bench.hh b/btl/generic_bench/bench.hh index d9906a4..2a5ba36 100644 --- a/btl/generic_bench/bench.hh +++ b/btl/generic_bench/bench.hh @@ -29,21 +29,19 @@ #include #include #include "timers/portable_perf_analyzer.hh" +#include "timers/distributed_perf_analyzer_root.hh" +#include "timers/distributed_perf_analyzer_node.hh" // #include "timers/mixed_perf_analyzer.hh" // #include "timers/x86_perf_analyzer.hh" // #include "timers/STL_perf_analyzer.hh" #ifdef HAVE_MKL extern "C" void cblas_saxpy(const int, const float, const float*, const = int, float *, const int); #endif -using namespace std; =20 template class Perf_Analyzer, class Action> BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point, bo= ol silent =3D false ) { - if (BtlConfig::skipAction(Action::name())) - return; - - string filename=3D"bench_"+Action::name()+".dat"; + std::string filename =3D "bench_"+Action::name()+".dat"; =20 if (!silent) { INFOS("starting " < oldSizes; - std::vector oldFlops; - bool hasOldResults =3D read_xy_file(filename, oldSizes, oldFlops, true= ); - int oldi =3D oldSizes.size() - 1; - // loop on matrix size Perf_Analyzer perf_action; - for (int i=3Dnb_point-1;i>=3D0;i--) + for (int i=3Dnb_point-1; i>=3D0; i--) { - //INFOS("size=3D" <=3D0 && oldSizes[oldi]>tab_sizes[i]) - --oldi; - if (oldi>=3D0 && oldSizes[oldi]=3D=3Dtab_sizes[i] && !silent) - { - if (oldFlops[oldi] "; - else - std::cout << "\t < "; - std::cout << oldFlops[oldi]; - } - --oldi; - } if (!silent) - std::cout << " MFlops (" << nb_point-i << "/" << nb_point << ")" = << std::endl; - } - - if (!BtlConfig::Instance.overwriteResults) - { - if (hasOldResults) - { - // merge the two data - std::vector newSizes; - std::vector newFlops; - int i=3D0; - int j=3D0; - while (i -BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point, bo= ol silent ){ - - // if the rdtsc is not available : +BTL_DONT_INLINE void bench( int size_min, int size_max, int nb_point, bo= ol silent =3D false) +{ bench(size_min,size_max,nb_point,silent= ); - // if the rdtsc is available : -// bench(size_min,size_max,nb_point); - +} =20 - // Only for small problem size. Otherwize it will be too long -// bench(size_min,size_max,nb_point); -// bench(size_min,size_max,nb_point); +// distributed Perf Analyzer =20 +template +BTL_DONT_INLINE void distr_bench( int size_min, int size_max, int nb_poi= nt, bool silent =3D false) +{ + int myid, nproc; + blacs_pinfo_(&myid, &nproc); + if (myid) + bench(size_min, size_max, nb= _point, silent); + else + bench(size_min, size_max, nb= _point, silent); } =20 #endif diff --git a/btl/generic_bench/timers/distributed_perf_analyzer_node.hh b= /btl/generic_bench/timers/distributed_perf_analyzer_node.hh new file mode 100644 index 0000000..7399d30 --- /dev/null +++ b/btl/generic_bench/timers/distributed_perf_analyzer_node.hh @@ -0,0 +1,78 @@ +#ifndef _PORTABLE_PERF_ANALYZER_NODE_HH +#define _PORTABLE_PERF_ANALYZER_NODE_HH + +#include "utilities.h" +#include "timers/portable_timer.hh" +#include "blacs.h" + +template +class Distributed_Perf_Analyzer_Node{ +public: + Distributed_Perf_Analyzer_Node( ):_nb_calc(0){ + MESSAGE("Distributed_Perf_Analyzer_Node Ctor"); + int temp, what =3D 0; + blacs_get_(&temp, &what, &context); + }; + Distributed_Perf_Analyzer_Node( const Distributed_Perf_Analyzer_Node& = ){ + INFOS("Copy Ctor not implemented"); + exit(0); + }; + ~Distributed_Perf_Analyzer_Node(){ + MESSAGE("Distributed_Perf_Analyzer_Node Dtor"); + }; + + BTL_DONT_INLINE double eval_mflops(int size, bool silent =3D false) + { + Action action(size); + + /* Find best _nb_calc_ */ + int bcast_receive, iZERO =3D 0, iONE =3D 1; + igebr2d_(&context, "A", " ", &iONE, &iONE, &bcast_receive, &iONE, &i= ZERO, &iZERO); + while (bcast_receive > 0) { + _nb_calc =3D bcast_receive; + action.initialize(); + time_calculate(action); + igebr2d_(&context, "A", " ", &iONE, &iONE, &bcast_receive, &iONE, = &iZERO, &iZERO); + } + int tries =3D -bcast_receive; + + /* Optimize */ + for (int i =3D 1; i < tries; ++i) { + Action _action(size); + _action.initialize(); + time_calculate(_action); + } + + /* Check */ + int do_check; + igebr2d_(&context, "A", " ", &iONE, &iONE, &do_check, &iONE, &iZERO,= &iZERO); + if (do_check > 0) { + action.initialize(); + action.calculate(); + action.check_result(); + } + + /* Return a void value */ + return 0.; + } + + BTL_DONT_INLINE void time_calculate(Action & action) + { + // no need for time measurement + action.calculate(); + for (int i =3D 0; i < _nb_calc; ++i) + action.calculate(); + } + + unsigned long long get_nb_calc() + { + return _nb_calc; + } + + +private: + int context; + unsigned long long _nb_calc; +}; + +#endif //_PORTABLE_PERF_ANALYZER_NODE_HH diff --git a/btl/generic_bench/timers/distributed_perf_analyzer_root.hh b= /btl/generic_bench/timers/distributed_perf_analyzer_root.hh new file mode 100644 index 0000000..ca59738 --- /dev/null +++ b/btl/generic_bench/timers/distributed_perf_analyzer_root.hh @@ -0,0 +1,94 @@ +#ifndef _PORTABLE_PERF_ANALYZER_ROOT_HH +#define _PORTABLE_PERF_ANALYZER_ROOT_HH + +#include "utilities.h" +#include "timers/portable_timer.hh" +#include "blacs.h" + +template +class Distributed_Perf_Analyzer_Root{ +public: + Distributed_Perf_Analyzer_Root( ):_nb_calc(0), m_time_action(0), _chro= nos(){ + MESSAGE("Distributed_Perf_Analyzer_Root Ctor"); + int temp, what =3D 0; + blacs_get_(&temp, &what, &context); + }; + Distributed_Perf_Analyzer_Root( const Distributed_Perf_Analyzer_Root &= ){ + INFOS("Copy Ctor not implemented"); + exit(0); + }; + ~Distributed_Perf_Analyzer_Root(){ + MESSAGE("Distributed_Perf_Analyzer_Root Dtor"); + }; + + BTL_DONT_INLINE double eval_mflops(int size, bool silent =3D false) + { + Action action(size); + m_time_action =3D 0; + _nb_calc =3D 0; + + /* Find best _nb_calc_ */ + int bcast_send =3D _nb_calc; + int iONE =3D 1; + while (m_time_action < MIN_TIME) { + _nb_calc =3D _nb_calc ? 2*_nb_calc : 1; + bcast_send =3D _nb_calc; + igebs2d_(&context, "A", " ", &iONE, &iONE, &bcast_send, &iONE); + action.initialize(); + m_time_action =3D time_calculate(action); + } + int tries =3D BtlConfig::Instance.tries; + bcast_send =3D -tries; + igebs2d_(&context, "A", " ", &iONE, &iONE, &bcast_send, &iONE); + + /* Optimize */ + for (int i =3D 1; i < tries; ++i) { + Action _action(size); + if (!silent) + std::cout << " " << _action.nb_op_base()*_nb_calc/(m_time_action= *1e6) << " "; + _action.initialize(); + m_time_action =3D std::min(m_time_action, time_calculate(_action))= ; + } + double time_action =3D m_time_action / (double(_nb_calc)); + + /* Check */ + int do_check =3D (BtlConfig::Instance.checkResults && size<128) ? 1 = : 0; + igebs2d_(&context, "A", " ", &iONE, &iONE, &do_check, &iONE); + if (do_check > 0) { + action.initialize(); + action.calculate(); + action.check_result(); + } + + return action.nb_op_base()/(time_action*1e6); + } + + BTL_DONT_INLINE double time_calculate(Action & action) + { + // time measurement + action.calculate(); + _chronos.start(); + for (int ii=3D0; ii<_nb_calc; ii++) + { + action.calculate(); + } + _chronos.stop(); + return _chronos.user_time(); + } + + unsigned long long get_nb_calc() + { + return _nb_calc; + } + + +private: + int context; + unsigned long long _nb_calc; + double m_time_action; + Portable_Timer _chronos; + +}; + +#endif //_PORTABLE_PERF_ANALYZER_ROOT_HH + diff --git a/btl/generic_bench/timers/portable_perf_analyzer.hh b/btl/gen= eric_bench/timers/portable_perf_analyzer.hh index 161992f..a8c261f 100644 --- a/btl/generic_bench/timers/portable_perf_analyzer.hh +++ b/btl/generic_bench/timers/portable_perf_analyzer.hh @@ -42,12 +42,8 @@ public: { Action action(size); =20 -// action.initialize(); -// time_action =3D time_calculate(action); - while (m_time_action < MIN_TIME) - { - if(_nb_calc=3D=3D0) _nb_calc =3D 1; - else _nb_calc *=3D 2; + while (m_time_action < MIN_TIME) { + _nb_calc =3D _nb_calc ? 2*_nb_calc : 1; action.initialize(); m_time_action =3D time_calculate(action); } @@ -79,7 +75,7 @@ public: // time measurement action.calculate(); _chronos.start(); - for (int ii=3D0;ii<_nb_calc;ii++) + for (int ii=3D0; ii<_nb_calc; ii++) { action.calculate(); } diff --git a/btl/libs/PBLAS/main.cpp b/btl/libs/PBLAS/main.cpp index 888f123..bca245c 100644 --- a/btl/libs/PBLAS/main.cpp +++ b/btl/libs/PBLAS/main.cpp @@ -3,7 +3,7 @@ #include "bench.hh" =20 #include -using namespace std; +//using namespace std; =20 #include "pblas_interface.hh" #include "action_parallel_matrix_vector_product.hh" @@ -22,7 +22,7 @@ int main(int argc, char **argv) blacs_gridinit_(&context, "Row-major", &procrows, &proccols); bool iamroot =3D (myid =3D=3D 0); =20 - bench= > >(MIN_MV,MAX_MV,NB_POINT,!iamroot); + distr_bench > >(10,MAX_MV,NB_POINT,!iamroot); =20 // blacs_exit_(&iZERO); MPI_Finalize();