From mboxrd@z Thu Jan 1 00:00:00 1970
Return-Path:
Received: from lists.gentoo.org (pigeon.gentoo.org [208.92.234.80])
	(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
	 key-exchange X25519 server-signature RSA-PSS (4096 bits))
	(No client certificate requested)
	by finch.gentoo.org (Postfix) with ESMTPS id 47AB615812D
	for ; Sun, 05 Jan 2025 21:50:07 +0000 (UTC)
Received: from pigeon.gentoo.org (localhost [127.0.0.1])
	by pigeon.gentoo.org (Postfix) with SMTP id D25D6E0819;
	Sun, 05 Jan 2025 21:49:59 +0000 (UTC)
Received: from smtp.gentoo.org (woodpecker.gentoo.org [140.211.166.183])
	(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
	 key-exchange X25519 server-signature RSA-PSS (4096 bits))
	(No client certificate requested)
	by pigeon.gentoo.org (Postfix) with ESMTPS id A0E2BE0819
	for ; Sun, 05 Jan 2025 21:49:59 +0000 (UTC)
Received: from oystercatcher.gentoo.org (oystercatcher.gentoo.org [148.251.78.52])
	(using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)
	 key-exchange X25519 server-signature RSA-PSS (4096 bits))
	(No client certificate requested)
	by smtp.gentoo.org (Postfix) with ESMTPS id A2CDB34133B
	for ; Sun, 05 Jan 2025 21:49:58 +0000 (UTC)
Received: from localhost.localdomain (localhost [IPv6:::1])
	by oystercatcher.gentoo.org (Postfix) with ESMTP id C682A1A9B
	for ; Sun, 05 Jan 2025 21:49:54 +0000 (UTC)
From: "Sam James"
To: gentoo-commits@lists.gentoo.org
Content-Transfer-Encoding: 8bit
Content-type: text/plain; charset=UTF-8
Reply-To: gentoo-dev@lists.gentoo.org, "Sam James"
Message-ID: <1736113601.8f2f5e1cf90b1b0d0d418355e16bf7e967df3482.sam@gentoo>
Subject: [gentoo-commits] repo/gentoo:master commit in: dev-libs/rccl/, dev-libs/rccl/files/
X-VCS-Repository: repo/gentoo
X-VCS-Files: dev-libs/rccl/Manifest dev-libs/rccl/files/rccl-6.3.0-headers-fix.patch dev-libs/rccl/files/rccl-6.3.0-same-rank-sendrecv.patch dev-libs/rccl/rccl-6.3.0.ebuild
X-VCS-Directories: dev-libs/rccl/ dev-libs/rccl/files/
X-VCS-Committer: sam
X-VCS-Committer-Name: Sam James
X-VCS-Revision: 8f2f5e1cf90b1b0d0d418355e16bf7e967df3482
X-VCS-Branch: master
Date: Sun, 05 Jan 2025 21:49:54 +0000 (UTC)
Precedence: bulk
List-Post:
List-Help:
List-Unsubscribe:
List-Subscribe:
List-Id: Gentoo Linux mail
X-BeenThere: gentoo-commits@lists.gentoo.org
X-Auto-Response-Suppress: DR, RN, NRN, OOF, AutoReply
X-Archives-Salt: 10215f71-0806-4681-aa35-4171750fe968
X-Archives-Hash: 354aa469e415fbdc61d316a18cf10bdb

commit:     8f2f5e1cf90b1b0d0d418355e16bf7e967df3482
Author:     Sv. Lockal gmail com>
AuthorDate: Mon Dec 9 21:10:54 2024 +0000
Commit:     Sam James gentoo org>
CommitDate: Sun Jan 5 21:46:41 2025 +0000
URL:        https://gitweb.gentoo.org/repo/gentoo.git/commit/?id=8f2f5e1c

dev-libs/rccl: add 6.3.0

Signed-off-by: Sv. Lockal gmail.com>
Signed-off-by: Sam James gentoo.org>

 dev-libs/rccl/Manifest                              |   1 +
 dev-libs/rccl/files/rccl-6.3.0-headers-fix.patch    |  12 +
 .../rccl/files/rccl-6.3.0-same-rank-sendrecv.patch  | 250 +++++++++++++++++++++
 dev-libs/rccl/rccl-6.3.0.ebuild                     |  75 +++++++
 4 files changed, 338 insertions(+)

diff --git a/dev-libs/rccl/Manifest b/dev-libs/rccl/Manifest
index 8ce9b421344b..a40a8eb250b1 100644
--- a/dev-libs/rccl/Manifest
+++ b/dev-libs/rccl/Manifest
@@ -1,2 +1,3 @@
 DIST rccl-5.7.1.tar.gz 1425561 BLAKE2B 852c111ad806d5c99f48b3c65c8cf37315c68b969f9544bfa14c1faf1d5557edcc57cdc21705ced6ded4a0288d42b1076e65fb67b3f89b4fa78cfba9d317b23e SHA512 5913b8ff67fa787714713b7d5b571374898be740d56c77db9f04fe7a3e6ca74023fa930a3494d8a6f984ac9e68ee318343835e110049d08700fe773376618af4
 DIST rccl-6.1.1.tar.gz 1679144 BLAKE2B 371d64691dc74f875c49e14df8f3f2d8b9c607376e6c5a889bd2bdb50607e88715d6d75ffed4ba3184a5b9b241cb37b8501e927a5f495632212909e410102490 SHA512 6c6376dd822182bcf28f573c0f3b5c7e52f94f4b670ee7c88519232f51b443d52cd37cbe6c41b5b6e9cb0b93c1124246a989f6e6a2ae74935134135585118002
+DIST rccl-6.3.0.tar.gz 1828647 BLAKE2B 8c312fc51e7d600bb62fa059e1af53e153955b79b2ba2e8a6b6b52228b9217b7df6dc815c3a48c0800aaa9387f645070e079d04e99c0e8ebdfe41d5ebe0bda06 SHA512 a068b4a21786176638d108c8c85d5e5a8b0413335b555c2602f2a2e0b9f291f6872dbf68fbb5a17a6a0af9d9b5a90b1b37cce63b655a867b68fc9e20d49931ea
diff --git a/dev-libs/rccl/files/rccl-6.3.0-headers-fix.patch b/dev-libs/rccl/files/rccl-6.3.0-headers-fix.patch
new file mode 100644
index 000000000000..297627819f2c
--- /dev/null
+++ b/dev-libs/rccl/files/rccl-6.3.0-headers-fix.patch
@@ -0,0 +1,12 @@
+gtest 1.14 included iomanip, gtest 1.15 does not anymore.
+Upstream bug: https://github.com/ROCm/rccl/issues/1455
+--- a/test/common/TestBed.cpp
++++ b/test/common/TestBed.cpp
+@@ -4,6 +4,7 @@
+  * See LICENSE.txt for license information
+  ************************************************************************/
+ #include
++#include
+ #include "TestBed.hpp"
+ #include
+
diff --git a/dev-libs/rccl/files/rccl-6.3.0-same-rank-sendrecv.patch b/dev-libs/rccl/files/rccl-6.3.0-same-rank-sendrecv.patch
new file mode 100644
index 000000000000..435d6ac57b0f
--- /dev/null
+++ b/dev-libs/rccl/files/rccl-6.3.0-same-rank-sendrecv.patch
@@ -0,0 +1,250 @@
+Enable UT sendrecv to same rank. Fixes test failure.
+Backports commit: https://github.com/ROCm/rccl/commit/fd9924cfe7afbb94b1f157972ba001865481480a
+--- a/test/SendRecvTests.cpp
++++ b/test/SendRecvTests.cpp
+@@ -16,7 +16,6 @@ namespace RcclUnitTesting
+     std::vector const numElements = {1048576, 53327, 1024, 0};
+     bool const inPlace = false;
+     bool const useManagedMem = false;
+-    int const groupCallId = 0;
+
+     OptionalColArgs options;
+     bool isCorrect = true;
+@@ -28,7 +27,10 @@ namespace RcclUnitTesting
+       int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
+       int totalRanks = numGpus * ranksPerGpu;
+       int const numProcesses = isMultiProcess ? numGpus : 1;
+-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
++      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu),
++                        {1,2}, //two group, second group sendrecv to self, has 2 coll
++                        testBed.GetNumStreamsPerGroup(1,2),
++                        2);
+
+       for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
+       for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
+@@ -37,6 +39,8 @@ namespace RcclUnitTesting
+       for (int recvRank = 0; recvRank < totalRanks; ++recvRank)
+       {
+         options.root = recvRank;
++        int groupCallId = sendRank == recvRank; //self sendrecv group has two coll
++        int recvId = sendRank == recvRank; //where recv will be second coll
+         testBed.SetCollectiveArgs(ncclCollSend,
+                                   dataTypes[dataIdx],
+                                   numElements[numIdx],
+@@ -47,36 +51,46 @@ namespace RcclUnitTesting
+                                   sendRank);
+         if (recvRank == 0)
+         {
+-          testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, sendRank);
+-          testBed.PrepareData(groupCallId, 0, sendRank);
+-        }
+-        if (recvRank != sendRank)
+-        {
+-          if (testBed.ev.showNames) // Show test names
+-            INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
+-                 isMultiProcess ? "MP" : "SP",
+-                 ncclDataTypeNames[dataTypes[dataIdx]],
+-                 sendRank,
+-                 recvRank,
+-                 numElements[numIdx]);
+-
+-          options.root = sendRank;
+-          testBed.SetCollectiveArgs(ncclCollRecv,
++          //set up the collArg slot to make sure AllocateMem is called once and correctly
++          testBed.SetCollectiveArgs(ncclCollSend,
+                                     dataTypes[dataIdx],
+                                     numElements[numIdx],
+                                     numElements[numIdx],
+                                     options,
+                                     0,
+-                                    groupCallId,
+-                                    recvRank);
+-          testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, recvRank);
+-          testBed.PrepareData(groupCallId, 0, recvRank);
+-          testBed.ExecuteCollectives({sendRank, recvRank});
+-          testBed.ValidateResults(isCorrect, groupCallId, 0, recvRank);
+-          testBed.DeallocateMem(groupCallId, 0, recvRank);
++                                    !groupCallId,
++                                    sendRank);
++          testBed.AllocateMem(inPlace, useManagedMem, 0, 0, sendRank);
++          testBed.PrepareData(0, 0, sendRank);
++          testBed.AllocateMem(inPlace, useManagedMem, 1, 0, sendRank);
++          testBed.PrepareData(1, 0, sendRank);
+         }
++
++        if (testBed.ev.showNames) // Show test names
++          INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
++               isMultiProcess ? "MP" : "SP",
++               ncclDataTypeNames[dataTypes[dataIdx]],
++               sendRank,
++               recvRank,
++               numElements[numIdx]);
++        options.root = sendRank;
++
++        testBed.SetCollectiveArgs(ncclCollRecv,
++                                  dataTypes[dataIdx],
++                                  numElements[numIdx],
++                                  numElements[numIdx],
++                                  options,
++                                  recvId,
++                                  groupCallId,
++                                  recvRank);
++        testBed.AllocateMem(inPlace, useManagedMem, groupCallId, recvId, recvRank);
++        testBed.PrepareData(groupCallId, recvId, recvRank);
++        testBed.ExecuteCollectives({sendRank, recvRank}, groupCallId);
++        testBed.ValidateResults(isCorrect, groupCallId, recvId, recvRank);
++        testBed.DeallocateMem(groupCallId, recvId, recvRank);
+       }
+-      testBed.DeallocateMem(groupCallId, 0, sendRank);
++      testBed.DeallocateMem(0, 0, sendRank);
++      testBed.DeallocateMem(1, 0, sendRank);
+     }
+     testBed.DestroyComms();
+   }
+@@ -94,7 +108,6 @@ namespace RcclUnitTesting
+     bool const inPlace = false;
+     bool const useManagedMem = false;
+     bool const userRegistered = true;
+-    int const groupCallId = 0;
+
+     OptionalColArgs options;
+     bool isCorrect = true;
+@@ -106,7 +119,10 @@ namespace RcclUnitTesting
+       int ranksPerGpu = rpg == 0 ? 1 : testBed.ev.maxRanksPerGpu;
+       int totalRanks = numGpus * ranksPerGpu;
+       int const numProcesses = isMultiProcess ? numGpus : 1;
+-      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu), 1);
++      testBed.InitComms(TestBed::GetDeviceIdsList(numProcesses, numGpus, ranksPerGpu),
++                        {1,2}, //two group, second group sendrecv to self, has 2 coll
++                        testBed.GetNumStreamsPerGroup(1,2),
++                        2);
+
+       for (int dataIdx = 0; dataIdx < dataTypes.size() && isCorrect; ++dataIdx)
+       for (int numIdx = 0; numIdx < numElements.size() && isCorrect; ++numIdx)
+@@ -115,6 +131,8 @@ namespace RcclUnitTesting
+       for (int recvRank = 0; recvRank < totalRanks; ++recvRank)
+       {
+         options.root = recvRank;
++        int groupCallId = sendRank == recvRank;
++        int recvId = sendRank == recvRank;
+         testBed.SetCollectiveArgs(ncclCollSend,
+                                   dataTypes[dataIdx],
+                                   numElements[numIdx],
+@@ -125,36 +143,45 @@ namespace RcclUnitTesting
+                                   sendRank);
+         if (recvRank == 0)
+         {
+-          testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, sendRank, userRegistered);
+-          testBed.PrepareData(groupCallId, 0, sendRank);
+-        }
+-        if (recvRank != sendRank)
+-        {
+-          if (testBed.ev.showNames) // Show test names
+-            INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
+-                 isMultiProcess ? "MP" : "SP",
+-                 ncclDataTypeNames[dataTypes[dataIdx]],
+-                 sendRank,
+-                 recvRank,
+-                 numElements[numIdx]);
+-
+-          options.root = sendRank;
+-          testBed.SetCollectiveArgs(ncclCollRecv,
++          testBed.SetCollectiveArgs(ncclCollSend,
+                                     dataTypes[dataIdx],
+                                     numElements[numIdx],
+                                     numElements[numIdx],
+                                     options,
+                                     0,
+-                                    groupCallId,
+-                                    recvRank);
+-          testBed.AllocateMem(inPlace, useManagedMem, groupCallId, 0, recvRank, userRegistered);
+-          testBed.PrepareData(groupCallId, 0, recvRank);
+-          testBed.ExecuteCollectives({sendRank, recvRank});
+-          testBed.ValidateResults(isCorrect, groupCallId, 0, recvRank);
+-          testBed.DeallocateMem(groupCallId, 0, recvRank);
++                                    !groupCallId,
++                                    sendRank);
++          testBed.AllocateMem(inPlace, useManagedMem, 0, 0, sendRank, userRegistered);
++          testBed.PrepareData(0, 0, sendRank);
++          testBed.AllocateMem(inPlace, useManagedMem, 1, 0, sendRank, userRegistered);
++          testBed.PrepareData(1, 0, sendRank);
+         }
++
++        if (testBed.ev.showNames) // Show test names
++          INFO("%s Datatype: %s SendReceive test Rank %d -> Rank %d for %d Elements\n",
++               isMultiProcess ? "MP" : "SP",
"MP" : "SP", ++ ncclDataTypeNames[dataTypes[dataIdx]], ++ sendRank, ++ recvRank, ++ numElements[numIdx]); ++ ++ options.root = sendRank; ++ testBed.SetCollectiveArgs(ncclCollRecv, ++ dataTypes[dataIdx], ++ numElements[numIdx], ++ numElements[numIdx], ++ options, ++ recvId, ++ groupCallId, ++ recvRank); ++ testBed.AllocateMem(inPlace, useManagedMem, groupCallId, recvId, recvRank, userRegistered); ++ testBed.PrepareData(groupCallId, recvId, recvRank); ++ testBed.ExecuteCollectives({sendRank, recvRank}, groupCallId); ++ testBed.ValidateResults(isCorrect, groupCallId, recvId, recvRank); ++ testBed.DeallocateMem(groupCallId, recvId, recvRank); + } +- testBed.DeallocateMem(groupCallId, 0, sendRank); ++ testBed.DeallocateMem(0, 0, sendRank); ++ testBed.DeallocateMem(1, 0, sendRank); + } + testBed.DestroyComms(); + } +--- a/test/common/TestBedChild.cpp ++++ b/test/common/TestBedChild.cpp +@@ -395,6 +395,8 @@ namespace RcclUnitTesting + { + CollectiveArgs& collArg = this->collArgs[groupId][localRank][collIdx]; + CHECK_CALL(collArg.AllocateMem(inPlace, useManagedMem, userRegistered)); ++ if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv)) ++ CHILD_NCCL_CALL(ncclCommRegister(this->comms[localRank], collArg.inputGpu.ptr, collArg.numInputBytesAllocated, &(collArg.commRegHandle)),"ncclCommRegister"); + if (this->verbose) INFO("Rank %d on child %d allocates memory for collective %d in group %d on device %d (%s,%s,%s) Input: %p Output %p\n", + globalRank, this->childId, collIdx, groupId, this->deviceIds[localRank], + inPlace ? "in-place" : "out-of-place", +@@ -646,8 +648,6 @@ namespace RcclUnitTesting + "ncclAllToAllv"); + break; + case ncclCollSend: +- if (collArg.userRegistered) +- CHILD_NCCL_CALL_RANK(errCode, ncclCommRegister(this->comms[localRank], collArg.inputGpu.ptr, collArg.numInputBytesAllocated, &(collArg.commRegHandle)),"ncclCommRegister"); + CHILD_NCCL_CALL_RANK(errCode, ncclSend( + collArg.inputGpu.ptr, + collArg.numInputElements, +@@ -658,8 +658,6 @@ namespace RcclUnitTesting + "ncclSend"); + break; + case ncclCollRecv: +- if (collArg.userRegistered) +- CHILD_NCCL_CALL_RANK(errCode, ncclCommRegister(this->comms[localRank], collArg.outputGpu.ptr, collArg.numOutputBytesAllocated, &(collArg.commRegHandle)), "ncclCommRegister"); + CHILD_NCCL_CALL_RANK(errCode, ncclRecv( + collArg.outputGpu.ptr, + collArg.numOutputElements, +@@ -891,8 +889,6 @@ namespace RcclUnitTesting + for (int collIdx = 0; collIdx < collArgs[groupId][localRank].size(); ++collIdx) + { + CollectiveArgs& collArg = this->collArgs[groupId][localRank][collIdx]; +- if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv)) +- CHILD_NCCL_CALL(ncclCommDeregister(this->comms[localRank], collArg.commRegHandle), "ncclCommDeregister"); + if (collId == -1 || collId == collIdx) + { + if (this->verbose) +@@ -900,6 +896,10 @@ namespace RcclUnitTesting + INFO("Child %d release memory for collective %d in group %d (Input: %p Output %p\n", + this->childId, collIdx, groupId, collArg.inputGpu.ptr, collArg.outputGpu.ptr); + } ++ if (collArg.userRegistered && (collArg.funcType == ncclCollSend || collArg.funcType == ncclCollRecv)) ++ { ++ CHILD_NCCL_CALL(ncclCommDeregister(this->comms[localRank], collArg.commRegHandle), "ncclCommDeregister"); ++ } + + CHECK_CALL(collArg.DeallocateMem()); + } diff --git a/dev-libs/rccl/rccl-6.3.0.ebuild b/dev-libs/rccl/rccl-6.3.0.ebuild new file mode 100644 index 000000000000..d610f7eb139c --- /dev/null +++ 
@@ -0,0 +1,75 @@
+# Copyright 1999-2024 Gentoo Authors
+# Distributed under the terms of the GNU General Public License v2
+
+EAPI=8
+
+ROCM_VERSION=${PV}
+
+inherit cmake edo rocm flag-o-matic
+
+DESCRIPTION="ROCm Communication Collectives Library (RCCL)"
+HOMEPAGE="https://github.com/ROCm/rccl"
+SRC_URI="https://github.com/ROCm/rccl/archive/rocm-${PV}.tar.gz -> rccl-${PV}.tar.gz"
+S="${WORKDIR}/rccl-rocm-${PV}"
+
+LICENSE="BSD"
+SLOT="0/$(ver_cut 1-2)"
+KEYWORDS="~amd64"
+IUSE="test"
+
+RDEPEND="
+    dev-util/hip:${SLOT}
+    dev-util/rocm-smi:${SLOT}"
+DEPEND="${RDEPEND}
+    sys-libs/binutils-libs"
+BDEPEND="
+    >=dev-build/cmake-3.22
+    >=dev-build/rocm-cmake-5.7.1
+    dev-util/hipify-clang:${SLOT}
+    test? ( dev-cpp/gtest )"
+
+RESTRICT="!test? ( test )"
+
+PATCHES=(
+    "${FILESDIR}/${PN}-6.0.2-fix-version-check.patch"
+    "${FILESDIR}/${PN}-6.3.0-same-rank-sendrecv.patch"
+    "${FILESDIR}/${PN}-6.3.0-headers-fix.patch"
+)
+
+src_prepare() {
+    cmake_src_prepare
+
+    # https://reviews.llvm.org/D69582 - clang does not support parallel jobs
+    sed '/parallel-jobs/d' -i CMakeLists.txt || die
+
+    # complete fix-version-check patch
+    sed "s/@rocm_version@/${PV}/" -i CMakeLists.txt || die
+
+    # don't install tests
+    sed "/rocm_install(TARGETS rccl-UnitTests/d" -i test/CMakeLists.txt || die
+}
+
+src_configure() {
+    rocm_use_hipcc
+
+    # lto flags make compilation fail with "undefined hidden symbol"
+    filter-lto
+
+    local mycmakeargs=(
+        -DCMAKE_SKIP_RPATH=ON
+        -DAMDGPU_TARGETS="$(get_amdgpu_flags)"
+        -DBUILD_TESTS=$(usex test ON OFF)
+        -DROCM_SYMLINK_LIBS=OFF
+        -DROCM_PATH="${EPREFIX}/usr"
+        -DRCCL_ROCPROFILER_REGISTER=OFF
+        -Wno-dev
+    )
+
+    cmake_src_configure
+}
+
+src_test() {
+    check_amdgpu
+    cd "${BUILD_DIR}" || die
+    LD_LIBRARY_PATH="${BUILD_DIR}" edob test/rccl-UnitTests
+}