diff --git a/.gitignore b/.gitignore index 4d0b5bcc822490504c6e1f7ef47466593a8a25f3..7c053e1b7c0effcad7367aef687bf6032de04084 100644 --- a/.gitignore +++ b/.gitignore @@ -30,3 +30,4 @@ cudacomplex cudasingle amdcpucomplex amdcpusingle +*build/ \ No newline at end of file diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 613ca93c3fefc77fb4380b6b915c237182206cb5..0000000000000000000000000000000000000000 --- a/.gitmodules +++ /dev/null @@ -1,4 +0,0 @@ -[submodule "thirdparty/pybind11"] - path = thirdparty/pybind11 - url = https://github.com/pybind/pybind11.git - diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cab794e33ff2a5a4e269ce29a52986331da3bc6..9df0cc7e9d8b211339a758731b574ccc01612297 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.20) set(PROJECT_NAME TLRMVM) @@ -35,7 +35,7 @@ set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib) option(BUILD_CPU "Build TLR-MVM using cpp" ON) option(BUILD_DPCPP "Build TLR-MVM on intel archs and use intel dpc++." OFF) option(BUILD_CUDA "Build TLR-MVM on NVIDIA gpu and cuda." OFF) # using NVIDIA GPU -# option(BUILD_HIP "Build TLR-MVM on amd gpu and use hip." OFF) # using AMD GPU (AMD is under dev) + option(BUILD_HIP "Build TLR-MVM on amd gpu and use hip." OFF) # using AMD GPU (AMD is under dev) ######################### # BLAS backend selection @@ -190,18 +190,16 @@ endif() # BUILD_CUDA # HIP library ####################### if(BUILD_HIP) - list(APPEND CMAKE_PREFIX_PATH $ENV{HIP_PATH} $ENV{ROCM_PATH} - $ENV{HIP_PATH}/hip $ENV{HIP_PATH}/llvm/lib/clang/14.0.0/lib/linux) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result ") + enable_language(HIP) + find_package(HIP REQUIRED) include(cmake/hiptlrmvm.cmake) add_library(tlrmvmhiplib SHARED ${CPU_HEADERS} ${CPU_SRCS} ${HIP_HEADERS} ${HIP_SRCS}) - find_package(hip REQUIRED) - find_package(hipblas REQUIRED) - set(TLRMVM_LIBS ${TLRMVM_LIBS} hip::device roc::hipblas) +# find_package(hipblas REQUIRED) + set(TLRMVM_LIBS ${TLRMVM_LIBS} hip::device hip::host hipblas) target_include_directories(tlrmvmhiplib PUBLIC ${TLRMVM_INCS}) target_link_libraries(tlrmvmhiplib PUBLIC ${TLRMVM_LIBS}) AddCompileDefinitions(tlrmvmhiplib) - target_compile_definitions(tlrmvmhiplib PUBLIC -D__HIP_PLATFORM_HCC__=1) # for clion search ... +# target_compile_definitions(tlrmvmhiplib PUBLIC -D__HIP_PLATFORM_HCC__=1) # for clion search ... target_compile_definitions(tlrmvmhiplib PUBLIC USE_HIP) install(TARGETS tlrmvmhiplib DESTINATION lib) endif() # BUILD_HIP @@ -212,7 +210,7 @@ endif() # BUILD_HIP ################# if(BUILD_PYTHON AND (BUILD_CUDA OR BUILD_HIP)) # now python is only available for CUDA and HIP build. - add_subdirectory(thirdparty/pybind11) + find_package(pybind11 REQUIRED) add_subdirectory(pytlrmvm) endif() diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..32483c58a65289d12c8f954774f2152290038666 --- /dev/null +++ b/LICENSE @@ -0,0 +1,24 @@ +Copyright (c) 2022, King Abdullah University of Science and Technology +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
index 1de8f157e8d50d5e09f55f4bafa8b4ccc91cc5ad..7f598fb7e1190f26bad1ea03537c8d82624b6518 100644
--- a/README.md
+++ b/README.md
@@ -105,7 +105,10 @@ Compile and install
 ## 4. Test
 You also need to download the dataset to run the experiments.
 Dataset download URLs:
-https://drive.google.com/drive/folders/1_DSyloFjlScXGTlA1_ybJnTne59tUpgR?usp=sharing
+1. Seismic redatuming dataset
+https://zenodo.org/record/6582600
+2. MAVIS AO system matrices dataset
+https://zenodo.org/record/7305622
 
 After downloading, put it in a separate folder and set `WORK_ROOT` to that folder.
 
diff --git a/cmake/hiptlrmvm.cmake b/cmake/hiptlrmvm.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..33b4d2508b5b79be102be87b2a97aa9c7d83f8d6
--- /dev/null
+++ b/cmake/hiptlrmvm.cmake
@@ -0,0 +1,21 @@
+include(${PROJECT_SOURCE_DIR}/cmake/tlrmvm.cmake)
+set(HIP_SRCS)
+set(HIP_HEADERS)
+
+# Common
+file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/common/hip/*.cpp)
+list(APPEND HIP_SRCS ${TMP})
+file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/common/hip/*.cu)
+list(APPEND HIP_SRCS ${TMP})
+file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/common/hip/*.hpp)
+list(APPEND HIP_HEADERS ${TMP})
+
+# tlrmvm
+file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/tlrmvm/hip/*.cpp)
+list(APPEND HIP_SRCS ${TMP})
+file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/tlrmvm/hip/*.cu)
+list(APPEND HIP_SRCS ${TMP})
+file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/tlrmvm/hip/*.hpp)
+list(APPEND HIP_HEADERS ${TMP})
+file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/tlrmvm/hip/*.cuh)
+list(APPEND HIP_HEADERS ${TMP})
\ No newline at end of file
diff --git a/pytlrmvm/convertmat2npy.py b/pytlrmvm/convertmat2npy.py
deleted file mode 100644
index 8d4a403d442a56f886e975ab73ecdb3e7e18dfcc..0000000000000000000000000000000000000000
--- a/pytlrmvm/convertmat2npy.py
+++ /dev/null
@@ -1,27 +0,0 @@
-##################################################################
-# @copyright (c) 2021- King Abdullah University of Science and
-# Technology (KAUST). All rights reserved.
-#
-# Author: Yuxi Hong, 2021.10.27
-# Description: Convert matlab .mat file to numpy npy.
-################################################################## -import os -from os.path import join -from scipy.io import loadmat -import numpy as np -import pickle -import argparse -import time - -parser = argparse.ArgumentParser() -parser.add_argument('--filename', type=str, help='matfilename') -parser.add_argument('--mat_root', type=str, help='matfile dir') -parser.add_argument('--work_root', type=str, help='workspace dir') -args = parser.parse_args() - -matname = join(args.mat_root, args.filename + '.mat') -npyname = join(args.work_root, args.filename + '.npy') -work_root = args.work_root -A = loadmat(matname)['Rfreq'] -with open(npyname, 'wb') as f: - np.save(f, A) diff --git a/pytlrmvm/generateseismicinput.py b/pytlrmvm/generateseismicinput.py deleted file mode 100644 index f388c7c9665a8b4e3a50c86bcf393121e0d21a3c..0000000000000000000000000000000000000000 --- a/pytlrmvm/generateseismicinput.py +++ /dev/null @@ -1,77 +0,0 @@ -import numpy as np -import sys -import os -import argparse -import time - -parser = argparse.ArgumentParser() -parser.add_argument('--TLRMVM_ROOT', type=str, help='installation dir') -parser.add_argument('--WORK_ROOT', type=str, help='workspace dir') -parser.add_argument('--nb', type=int, help='nb') -parser.add_argument('--error_threshold', type=str,help='error threshold') -parser.add_argument('--problemname', type=str, help='problem name') -parser.add_argument('--datatype', type=str, help='datatype of dataset') - -args = parser.parse_args() - -print("Your installation path: ", args.TLRMVM_ROOT) -print("Your workspace path: ", args.WORK_ROOT) - -# your TLRMVM_ROOT should point to your installation path. -TLRMVM_ROOT = args.TLRMVM_ROOT -sys.path.append( os.path.join( args.TLRMVM_ROOT,"python") ) -WORK_ROOT = args.WORK_ROOT -print("Downloading dataset to path: {}".format( WORK_ROOT )) -if not os.path.exists(WORK_ROOT): - os.mkdir(WORK_ROOT) -problemname = args.problemname - -from tlrmvmtools import * -dtype = None -datatype = args.datatype -if datatype == 'float': - dtype = np.float32 -elif datatype == 'double': - dtype = np.float64 -elif datatype == 'csingle': - dtype = np.csingle -elif datatype == 'cdouble': - dtype = np.cdouble -else: - print("Not support datatype.") - exit(1) - -# get your data matrix in numpy format -# you can download the matrix used in this example at -# https://drive.google.com/file/d/1KY4eeVSMm2mWOOKVU7QjsAf6tOREv-99/view?usp=sharing -A = np.load( os.path.join(WORK_ROOT, "{}.npy".format(problemname)) ).astype(dtype) - -if datatype in ['csingle', 'cdouble']: - randomx = np.random.rand(A.shape[1]) + 1j * np.random.rand(A.shape[1]) - randomx = randomx.astype(dtype) -else: - randomx = np.random.rand(A.shape[1]) - randomx = randomx.astype(dtype) - -""" -Below is needed for creating input of tlrmvm. -They are parameters Tile size (nb) and Accuracy Threshold (error_threshold) -you can play with to get decent performance and numerical accuracy. -""" -m = A.shape[0] -n = A.shape[1] -nb = args.nb -error_threshold = args.error_threshold # we use string for easy concatnating. 
workplacefolder = WORK_ROOT
-datasetname = args.problemname
-
-# create tlrmvm util class
-tlrmvmutil = TLRMVM_Util(A, nb, workplacefolder, error_threshold, datasetname)
-# compute svd and save
-tlrmvmutil.computesvd()
-# create input of tlrmvm
-tlrmvmutil.saveUV()
-# get compression info
-tlrmvmutil.printdatainfo()
-
-tlrmvmutil.saveX(randomx)
\ No newline at end of file
diff --git a/pytlrmvm/src/Wrapper.cpp b/pytlrmvm/src/Wrapper.cpp
index ffc5170617158710cf9c8e7b4a9e6dcc54e72d21..05ea3d204047ceeedd7c1f85d286129293cab2ef 100644
--- a/pytlrmvm/src/Wrapper.cpp
+++ b/pytlrmvm/src/Wrapper.cpp
@@ -157,7 +157,6 @@ PYBIND11_MODULE(TLRMVMpy, m) {
     m.def("BatchUpdatexgpu_INT8_cf", &BatchUpdatex_INT8);
     m.def("BatchUpdateygpu_INT8_cf", &BatchUpdatey_INT8);
 
-    addbatchtlrmvmgpufp16int8(m);
     m.def("SetMaskmat", &SetMaskmat);
 }
 
diff --git a/pytlrmvm/tlrmvmtools.py b/pytlrmvm/tlrmvmtools.py
index c8561451773cc4f08709dd343621fef5256e1a12..152c3192dba45008767e66f3e22a7d160ae1de4f 100644
--- a/pytlrmvm/tlrmvmtools.py
+++ b/pytlrmvm/tlrmvmtools.py
@@ -2,113 +2,118 @@
 # @copyright (c) 2021- King Abdullah University of Science and
 # Technology (KAUST). All rights reserved.
 #
-# Author: Yuxi Hong, 2021.10.27
-# Description: A tools for generating compressed U and V bases.
-# They are input of TLR-MVM.
+# Author: Yuxi Hong
+# Description: A tool for generating the compressed U and V bases
+# that are the input of TLR-MVM.
 ##################################################################
 import os
 from os.path import join, exists
 from tqdm import tqdm
 import numpy as np
-import pickle
+import pickle
+from scipy.linalg import svd
+
 
 class TLRMVM_Util:
-    """A TLR-MVM Utility class
+    """A TLR-MVM Utility class
     1. compute svd for input of TLR-MVM
     3. save U and V bases
     4. save Dense matrix
     """
+
     def __init__(self, denseAarray, nb, datafolder, error_threshold, problemname, rankmodule) -> None:
         self.denseA = denseAarray
         self.dtype = denseAarray.dtype
         self.m = denseAarray.shape[0]
         self.n = denseAarray.shape[1]
-        self.nb = nb
+
+        self.nb = nb
         self.mtg = self.m // nb if self.m % nb == 0 else self.m // nb + 1
         self.ntg = self.n // nb if self.n % nb == 0 else self.n // nb + 1
-        self.paddingm = self.mtg * nb
-        self.paddingn = self.ntg * nb
+        self.paddingm = self.mtg * nb
+        self.paddingn = self.ntg * nb
         self.datafolder = datafolder
         if not exists(self.datafolder):
             print("Folder {} not exists!".format(self.datafolder))
         self.error_threshold = error_threshold
         self.problemname = problemname
         self.rankfile = join(self.datafolder, 'compresseddata',
-                '{}_Rmat_nb{}_acc{}.bin'.format(self.problemname,self.nb,self.error_threshold))
+                             '{}_Rmat_nb{}_acc{}.bin'.format(self.problemname, self.nb, self.error_threshold))
         self.rankmodule = rankmodule
 
     def computesvd(self):
         A = self.denseA
-        padding_m = self.paddingm
+        padding_m = self.paddingm
         padding_n = self.paddingn
-        m = self.m
-        n = self.n
-        ntiles = self.ntg
-        mtiles = self.mtg
+        m = self.m
+        n = self.n
+        mtiles = self.mtg
+        ntiles = self.ntg
         svdsavepath = join(self.datafolder, 'SVDinfo')
         if not exists(svdsavepath):
             os.mkdir(svdsavepath)
-        nb = self.nb
-        svdname = join( svdsavepath, '{}_nb{}.pickle'.format(self.problemname,nb) )
+        nb = self.nb
+        svdname = join(svdsavepath, '{}_nb{}.pickle'.format(self.problemname, nb))
         if exists(svdname):
             print("svd {} exists.".format(svdname))
-            return
+            return
         else:
             print("save svd to {}.
".format(svdname)) bigmap = dict() - padA = np.zeros((padding_m,padding_n),dtype=self.dtype) - padA[:m,:n] = A + padA = np.zeros((padding_m, padding_n), dtype=self.dtype) + padA[:m, :n] = A for j in tqdm(range(ntiles)): for i in range(mtiles): - curblock = padA[i*nb:(i+1)*nb, j*nb:(j+1)*nb] - [u,s,v] = np.linalg.svd(curblock) - bigmap['{}_{}'.format(i,j)] = [u,s,v] - with open( svdname,'wb') as f: - pickle.dump(bigmap, f) + curblock = padA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb] + [u, s, v] = svd(curblock) + bigmap['{}_{}'.format(i, j)] = [u, s, v] + with open(svdname, 'wb') as f: + pickle.dump(bigmap, f) def saveX(self, xvec): xfile = join(self.datafolder, '{}_x.bin'.format(self.problemname)) xvec.tofile(xfile) def saveUV(self): - svdname = join( self.datafolder, 'SVDinfo', '{}_nb{}.pickle'.format(self.problemname,self.nb) ) + svdname = join(self.datafolder, 'SVDinfo', '{}_nb{}.pickle'.format(self.problemname, self.nb)) if not exists(svdname): print("please do svd to matrix first!") with open(svdname, 'rb') as f: bigmap = pickle.load(f) - nb = self.nb + nb = self.nb acc = self.error_threshold - uvsavepath = join(self.datafolder,'compresseddata') + uvsavepath = join(self.datafolder, 'compresseddata') if not exists(uvsavepath): os.mkdir(uvsavepath) - ufile = uvsavepath + '/{}_Ubases_nb{}_acc{}.bin'.format(self.problemname,nb,acc) - vfile = uvsavepath + '/{}_Vbases_nb{}_acc{}.bin'.format(self.problemname,nb,acc) - rfile = uvsavepath + '/{}_Rmat_nb{}_acc{}.bin'.format(self.problemname,nb,acc) - if exists(ufile) and exists(vfile) and exists(rfile): - print("Compress file exists. ") - return + ufile = uvsavepath + '/{}_Ubases_nb{}_acc{}.bin'.format(self.problemname, nb, acc) + vfile = uvsavepath + '/{}_Vbases_nb{}_acc{}.bin'.format(self.problemname, nb, acc) + rfile = uvsavepath + '/{}_Rmat_nb{}_acc{}.bin'.format(self.problemname, nb, acc) + print("generate uvr file to {}.".format(uvsavepath)) - padding_m = self.paddingm + padding_m = self.paddingm padding_n = self.paddingn - m = self.m - n = self.n - ntiles = self.ntg - mtiles = self.mtg - uvsavepath = self.datafolder - nb = self.nb + m = self.m + n = self.n + ntiles = self.ntg + mtiles = self.mtg + nb = self.nb tmpacc = self.error_threshold - acc = tmpacc if isinstance(tmpacc,float) else float(tmpacc) - ApproximateA = np.zeros((padding_m, padding_n),dtype=self.dtype) - originpadA = np.zeros((padding_m, padding_n),dtype=self.dtype) - originpadA[:m,:n] = self.denseA - normA = np.linalg.norm(self.denseA,'fro') - ranklist = np.zeros(mtiles * ntiles,dtype=np.int32) + acc = tmpacc if isinstance(tmpacc, float) else float(tmpacc) + ApproximateA = np.zeros((padding_m, padding_n), dtype=self.dtype) + originpadA = np.zeros((padding_m, padding_n), dtype=self.dtype) + originpadA[:m, :n] = self.denseA + normA = np.linalg.norm(self.denseA, 'fro') + ranklist = np.zeros((mtiles, ntiles), dtype=np.int32) + print("rankmat shape, ", ranklist.shape) ulist = [[] for _ in range(mtiles)] vlist = [[] for _ in range(mtiles)] - for i in tqdm(range(mtiles)): - for j in range(ntiles): - curblock = originpadA[i*nb:(i+1)*nb, j*nb:(j+1)*nb] - [u,s,v] = bigmap['{}_{}'.format(i,j)] + + p = mtiles + for i in tqdm(range(mtiles - 1)): + for j in range(ntiles - 1): + curblock = originpadA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb] + normblock = np.linalg.norm(curblock, 'fro') + [u, s, v] = bigmap['{}_{}'.format(i, j)] srk = 0 erk = nb while srk != erk: @@ -116,54 +121,169 @@ class TLRMVM_Util: tmpu = u[:, :midrk] tmps = s[:midrk] tmpv = v[:midrk, :] - if 
np.linalg.norm(curblock-(tmpu*tmps)@tmpv, ord='fro') < normA * acc: + if np.linalg.norm(curblock - (tmpu * tmps) @ tmpv, ord='fro') < normA * acc: erk = midrk else: - srk = midrk+1 - if srk % self.rankmodule != 0: - mod4srk = ((srk//self.rankmodule) + 1) * self.rankmodule - else: - mod4srk = srk - mod4srk = min(mod4srk, nb) - srk = mod4srk + srk = midrk + 1 + if srk == 0: + srk = 1 tmpu = u[:, :srk] tmps = s[:srk] tmpv = v[:srk, :] + ApproximateA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb] = (tmpu * tmps) @ tmpv + us = tmpu * tmps + vt = tmpv if srk == 0: - ApproximateA[i*nb:(i+1)*nb, j*nb:(j+1)*nb] = np.zeros((nb,nb),dtype=self.dtype) + ranklist[i, j] = 1 + ulist[i].append(np.zeros((nb, 1), dtype=self.dtype)) + vlist[i].append(np.zeros((1, nb), dtype=self.dtype)) + else: + ranklist[i, j] = srk + ulist[i].append(us) + vlist[i].append(vt) + + def getsrk(normA, nb, acc, u, s, v): + srk = 0 + erk = nb + while srk != erk: + midrk = (srk + erk) // 2 + tmpu = u[:, :midrk] + tmps = s[:midrk] + tmpv = v[:midrk, :] + if np.linalg.norm(curblock - (tmpu * tmps) @ tmpv, ord='fro') < normA * acc: + erk = midrk else: - ApproximateA[i*nb:(i+1)*nb, j*nb:(j+1)*nb] = (tmpu*tmps)@tmpv - us = tmpu * tmps + srk = midrk + 1 + return srk + + for i in tqdm(range(mtiles)): + for j in range(ntiles): + if i < mtiles - 1 and j < ntiles - 1: + continue + curblock = originpadA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb] + normblock = np.linalg.norm(curblock, 'fro') + [u, s, v] = bigmap['{}_{}'.format(i, j)] + if i < mtiles - 1 or j < ntiles - 1: + if i == mtiles - 1: + presum = np.sum(ranklist[:, j]) + srk = getsrk(normA, nb, acc, u, s, v) + while srk < nb and (srk + presum) % self.rankmodule != 0: + srk += 1 + + if srk == nb and (srk + presum) % self.rankmodule != 0: + print("can't find a solution! i = mtiles") + exit() + else: + ranklist[i, j] = srk + elif j == ntiles - 1: + presum = np.sum(ranklist[i, :]) + srk = getsrk(normA, nb, acc, u, s, v) + while srk < nb and (srk + presum) % self.rankmodule != 0: + srk += 1 + if srk == nb and (srk + presum) % self.rankmodule != 0: + print("can't find a solution! 
j = ntiles")
+                        exit()
+                    else:
+                        ranklist[i, j] = srk
+                elif i == mtiles - 1 and j == ntiles - 1:
+                    srk = 0
+                    while srk < nb and (srk + np.sum(ranklist[i, :])) % self.rankmodule != 0 and \
+                            (srk + np.sum(ranklist[:, j])) % self.rankmodule != 0:
+                        srk += 1
+                    if srk == nb:
+                        print("can't find a solution!")
+                        exit()
+                    else:
+                        ranklist[i, j] = srk
+                if srk == 0:
+                    srk = self.rankmodule
+                tmpu = u[:, :srk]
+                tmps = s[:srk]
+                tmpv = v[:srk, :]
+                ApproximateA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb] = (tmpu * tmps) @ tmpv
+                us = tmpu * tmps
                 vt = tmpv
                 if srk == 0:
-                    ranklist[j*mtiles+i] = 1
-                    ulist[i].append(np.zeros((nb,1),dtype=self.dtype))
-                    vlist[i].append(np.zeros((1,nb),dtype=self.dtype))
+                    ranklist[i, j] = 1
+                    ulist[i].append(np.zeros((nb, 1), dtype=self.dtype))
+                    vlist[i].append(np.zeros((1, nb), dtype=self.dtype))
                 else:
-                    ranklist[j*mtiles+i] = srk
+                    ranklist[i, j] = srk
                     ulist[i].append(us)
                     vlist[i].append(vt)
         tmpurow = []
         for x in ulist:
-            tmpurow.append(np.concatenate(x,axis=1))
-        finalu = np.concatenate(tmpurow,axis=1)
+            tmpurow.append(np.concatenate(x, axis=1))
+        finalu = np.concatenate(tmpurow, axis=1)
         finalu.T.tofile(ufile)
         tmpvcol = []
-        npvlist = np.array(vlist,dtype=np.object)
+        npvlist = np.array(vlist, dtype=object)
         for i in range(npvlist.shape[1]):
-            tmpvcol.append(np.concatenate(npvlist[:,i],axis=0))
-
+            tmpvcol.append(np.concatenate(npvlist[:, i], axis=0))
+
         with open(vfile, 'wb') as f:
             for x in tmpvcol:
                 x.T.tofile(f)
-        ranklist.tofile(rfile)
+        ranklist.T.tofile(rfile)
 
     def printdatainfo(self):
         print("Description of Dataset: ")
-        print("problem name : {} ".format(self.problemname) )
+        print("problem name : {} ".format(self.problemname))
         print("m is {} n {} nb is {} error threshold is {}.".format(self.m, self.n, self.nb, self.error_threshold))
-        rankfile = join(self.datafolder, 'compresseddata', '{}_Rmat_nb{}_acc{}.bin'.format(self.problemname, self.nb, self.error_threshold))
+        rankfile = join(self.datafolder, 'compresseddata',
+                        '{}_Rmat_nb{}_acc{}.bin'.format(self.problemname, self.nb, self.error_threshold))
         self.ranklist = np.fromfile(rankfile, dtype=np.int32)
-        mn = self.m * self.n
+        mn = self.m * self.n
         rank = np.sum(self.ranklist)
-        print("Global rank is {}, compression rate is {:.3f}%.".format( rank, 2*rank*self.nb / mn * 100))
+        print("Global rank is {}, compression rate is {:.3f}%.".format(rank, 2 * rank * self.nb / mn * 100))
+
+
+if __name__ == "__main__":
+    import numpy as np
+    import os
+    import argparse
+    from astropy.io.fits import open as fitsopen
+    from scipy.io import loadmat
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--nb', type=int, help='nb')
+    parser.add_argument('--error_threshold', type=str, help='error threshold.')
+    parser.add_argument('--compressed_name', type=str, help='The file name for compressed U, V, and R.')
+    parser.add_argument('--data_dir', type=str, help='your original data dir.')
+    parser.add_argument('--data_type', type=str, help='datatype of dataset.')
+    parser.add_argument('--data_name', type=str, help='The name of original matrix.')
+    parser.add_argument('--matlabmat_name', type=str, default=None, help='The name of original matrix in .mat file.')
+    parser.add_argument('--rank_module', type=int, help='rank module.')
+    args = parser.parse_args()
+    dtype = None
+    datatype = args.data_type
+    if datatype == 'float':
+        dtype = np.float32
+    elif datatype == 'double':
+        dtype = np.float64
+    elif datatype == 'csingle':
+        dtype = np.csingle
+    elif datatype == 'cdouble':
+        dtype = np.cdouble
+    else:
+        print("Unsupported datatype.")
+        exit(1)
+    A = None
+    if args.data_name.split('.')[-1] == 'npy':
+        A = np.load(join(args.data_dir,args.data_name)).astype(dtype)
+    elif args.data_name.split('.')[-1] == 'fits':
+        A = fitsopen(join(args.data_dir,args.data_name))[0].data.astype(dtype)
+    elif args.data_name.split('.')[-1] == 'mat':
+        A = loadmat(join(args.data_dir,args.data_name))[args.matlabmat_name]
+    else:
+        A = pickle.load(open(join(args.data_dir,args.data_name), 'rb')).astype(dtype)
+    rankmodule = int(args.rank_module)
+    if rankmodule == 0:
+        print("rank_module must not be 0.")
+        exit()
+    tlrmvmutil = TLRMVM_Util(A, args.nb, args.data_dir, args.error_threshold, args.compressed_name, rankmodule)
+    # compute svd and save
+    tlrmvmutil.computesvd()
+    # create input of tlrmvm
+    tlrmvmutil.saveUV()
+    # get compression info
+    tlrmvmutil.printdatainfo()
diff --git a/setup.py b/setup.py
index 69e874385d96dc68dfe463c7b87bd04a770d7846..f61cd4740a54f4c111894721319be129ff8d8eb1 100644
--- a/setup.py
+++ b/setup.py
@@ -64,7 +64,7 @@ class build_ext(build_ext_orig):
             "-DCMAKE_C_COMPILER={}".format(self.c_compiler),
             "-DCMAKE_CXX_COMPILER={}".format(self.cxx_compiler),
             "-DCMAKE_CUDA_HOST_COMPILER={}".format(self.cxx_compiler),
-            "-DCMAKE_CUDA_FLAGS='-ccbin {}'".format(self.cxx_compiler),
+            #"-DCMAKE_CUDA_FLAGS='-ccbin {}'".format(self.cxx_compiler),
             "-DUSE_MKL=ON",
             "-DUSE_MPI=ON",
             "-DBUILD_CUDA=ON",
diff --git a/src/common/cpu/Util.cpp b/src/common/cpu/Util.cpp
index 78bac44298e95b55385257a70328de72a817f258..8fb2883c80b7958e4828f139bde660944f4ea4cc 100644
--- a/src/common/cpu/Util.cpp
+++ b/src/common/cpu/Util.cpp
@@ -137,22 +137,16 @@ ArgsParser::ArgsParser(int argc, char**argv){
 
 int ArgsParser::getint(string key){
     if(argmap.find(key) == argmap.end())
-    {cout << "key error in getint" << endl; exit(0);}
+    {cout << "key error in getint: " << key << endl; exit(0);}
     return atoi(argmap[key].c_str());
 }
 
 string ArgsParser::getstring(string key){
     if(argmap.find(key) == argmap.end())
-    {cout << "key error in getstring" << endl; exit(0);}
+    {cout << "key error in getstring: " << key << endl; exit(0);}
     return argmap[key];
 }
 
-bool ArgsParser::getbool(string key){
-    if(argmap.find(key) == argmap.end())
-    {cout << "key error in getint" << endl; exit(0);}
-    return atoi(argmap[key].c_str());
-}
-
 
 template<typename T>
diff --git a/src/tlrmvm/cpu/TlrmvmCPU.cpp b/src/tlrmvm/cpu/TlrmvmCPU.cpp
index a8809225de20f3d6b3bbfc7b50d0604fa99978e9..d2e7c8c25edf2a62dc0a60411f66b39a94894a5a 100644
--- a/src/tlrmvm/cpu/TlrmvmCPU.cpp
+++ b/src/tlrmvm/cpu/TlrmvmCPU.cpp
@@ -7,6 +7,8 @@
 #include "../../common/AppUtil.hpp"
 #include "../../common/cpu/Util.hpp"
 #include <memory>
+#include <string.h>
+#include <stdio.h>
 #ifdef USE_MPI
 #include <mpi.h>
 #endif
@@ -124,7 +126,7 @@ void TlrmvmBase<T>::InitData(){
         RandomX(Datax, config.originN);
         this->xmat = Matrix<T>(Datax, config.paddingN, 1);
     }else{
-        char filename[200];
+        char filename[300];
         sprintf(filename, "%s/%s_Ubases_nb%d_acc%s.bin",
             config.datafolder.c_str(), config.problemname.c_str(),config.nb, config.acc.c_str());
         size_t elems = config.granksum * config.nb;
@@ -187,10 +189,10 @@ void TlrmvmBase<T>::Phase1GetMembuffer(){
 
 template<typename T>
 void TlrmvmBase<T>::Phase1GetMembufferTranspose() {
-    GetHostMemory(&p1transptrs.Abp, config.Ntg);
-    GetHostMemory(&p1transptrs.xbp, config.Ntg);
-    GetHostMemory(&p1transptrs.ybp, config.Ntg);
-    for(int i=0; i<config.Ntg; i++){
+    GetHostMemory(&p1transptrs.Abp, config.Mtg);
+    GetHostMemory(&p1transptrs.xbp, config.Mtg);
+    GetHostMemory(&p1transptrs.ybp, config.Mtg);
+    for(int i=0; i<config.Mtg; i++){
         p1transptrs.Ms.push_back(config.rowsum[i]);
         p1transptrs.Ks.push_back(config.nb);
         p1transptrs.Ns.push_back(1);
@@ -198,7 +200,7 @@ void TlrmvmBase<T>::Phase1GetMembufferTranspose() {
     p1transptrs.Acnt = 0;
     p1transptrs.Xcnt = 0;
    p1transptrs.Ycnt = 0;
-    for(int i=0; i<config.Ntg; i++){
+    for(int i=0; i<config.Mtg; i++){
         p1transptrs.Acnt += p1transptrs.Ms[i] * p1transptrs.Ks[i];
         p1transptrs.Xcnt += p1transptrs.Ks[i] * p1transptrs.Ns[i];
         p1transptrs.Ycnt += p1transptrs.Ms[i] * p1transptrs.Ns[i];
@@ -347,10 +349,10 @@ void TlrmvmBase<T>::Phase2PrepareTranspose() {
 
 template<typename T>
 void TlrmvmBase<T>::Phase3GetMembuffer(){
-    GetHostMemory(&p3ptrs.Abp, config.Ntg);
-    GetHostMemory(&p3ptrs.xbp, config.Ntg);
-    GetHostMemory(&p3ptrs.ybp, config.Ntg);
-    for(int i=0; i<config.Ntg; i++){
+    GetHostMemory(&p3ptrs.Abp, config.Mtg);
+    GetHostMemory(&p3ptrs.xbp, config.Mtg);
+    GetHostMemory(&p3ptrs.ybp, config.Mtg);
+    for(int i=0; i<config.Mtg; i++){
         p3ptrs.Ms.push_back(config.nb);
         p3ptrs.Ks.push_back(config.rowsum[i]);
         p3ptrs.Ns.push_back(1);
@@ -358,7 +360,7 @@ void TlrmvmBase<T>::Phase3GetMembuffer(){
     p3ptrs.Acnt = 0;
     p3ptrs.Xcnt = 0;
     p3ptrs.Ycnt = 0;
-    for(int i=0; i<config.Ntg; i++){
+    for(int i=0; i<config.Mtg; i++){
         p3ptrs.Acnt += p3ptrs.Ms[i] * p3ptrs.Ks[i];
         p3ptrs.Xcnt += p3ptrs.Ks[i] * p3ptrs.Ns[i];
         p3ptrs.Ycnt += p3ptrs.Ms[i] * p3ptrs.Ns[i];
diff --git a/src/tlrmvm/hip/BatchTlrmvmhip.cpp b/src/tlrmvm/hip/BatchTlrmvmhip.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..835d647ef0123a3a5446ccd856d4990a2e6cc107
--- /dev/null
+++ b/src/tlrmvm/hip/BatchTlrmvmhip.cpp
@@ -0,0 +1,712 @@
+//
+// Created by Yuxi Hong on 02/03/2022.
+//
+
+#include "../../common/Common.hpp"
+#include "../cpu/TlrmvmCPU.hpp"
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hipblas.h>
+#include "BatchTlrmvmhip.hpp"
+#include "hipkernel.cuh"
+#include <chrono>
+
+namespace hiptlrmvm
+{
+    template<typename HostType, typename DeviceType>
+    BatchTlrmvmhip<HostType, DeviceType>::BatchTlrmvmhip(vector<TlrmvmConfig> tlrmvmconfigvec)
+    :config_vec(tlrmvmconfigvec),batchsize(tlrmvmconfigvec.size())
+    {
+        cout << "calling Batch Tlrmvmhip" << endl;
+#ifdef USE_MPI
+        int initflag;
+        MPI_Initialized(&initflag);
+        if(initflag == 1){
+            int rank;
+            int size;
+            MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+            MPI_Comm_size(MPI_COMM_WORLD,&size);
+            if(rank == 0)
+                cout << "we are in an MPI environment." << endl;
+            int totaldevcount = 0;
+            HIPCHECK(hipGetDeviceCount(&totaldevcount));
+            if(totaldevcount < size){
+                if(rank == 0)
+                    cout << "not enough devices; in debug mode, setting all ranks to device 0." << endl;
+                HIPCHECK(hipSetDevice(0));
+            }else{
+                if(rank == 0)
+                    cout << "enough devices available, assigning ranks to different devices." << endl;
+                HIPCHECK(hipSetDevice(rank%8));
+            }
+        }
+
+#endif
+        cpuinstvec.resize(tlrmvmconfigvec.size());
+        for(int i=0; i<tlrmvmconfigvec.size(); i++)
+            cpuinstvec[i] = std::move(make_shared<TlrmvmCPU<HostType>>(tlrmvmconfigvec[i]));
+        init_alpha_beta(alpha, beta);
+        finalresults.resize(tlrmvmconfigvec.size() * tlrmvmconfigvec[0].originM);
+    }
+
+    template<typename HostType, typename DeviceType>
+    BatchTlrmvmhip<HostType, DeviceType>::BatchTlrmvmhip(){}
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::StreamInit(int streamsize){
+        this->stream_size = streamsize;
+        streamptr = new hipStream_t[streamsize];
+        cublashandleptr = new hipblasHandle_t[streamsize];
+        for(int i=0; i<streamsize; i++)
+            hipStreamCreateWithFlags(&streamptr[i], hipStreamNonBlocking);
+        for(int i=0; i<streamsize; i++)
+            hipblasCreate(&cublashandleptr[i]);
+        for(int i=0; i<streamsize; i++)
+            hipblasSetStream(cublashandleptr[i], streamptr[i]);
+        multigraph.StreamInit(batchsize, streamsize);
+        transposemultigraph.StreamInit(batchsize, streamsize);
+        singlegraph.StreamInit(streamsize);
+        transposesinglegraph.StreamInit(streamsize);
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::MemoryInit(){
+        cudap1ptrs_vec.resize(batchsize);
+        cudap3ptrs_vec.resize(batchsize);
+        cudap1transptrs_vec.resize(batchsize);
+        cudap3transptrs_vec.resize(batchsize);
+        auto start = std::chrono::steady_clock::now();
+#pragma omp parallel for default(none)
+        for(int bi=0; bi<batchsize; bi++){
+            cpuinstvec[bi]->MemoryInit();
+            PhasePointersCopyNonPointers<HostType, DeviceType>
+                    (cudap1ptrs_vec[bi],cpuinstvec[bi]->p1ptrs);
+            PhasePointersCopyNonPointers<HostType, DeviceType>
+                    (cudap3ptrs_vec[bi],cpuinstvec[bi]->p3ptrs);
+            PhasePointersCopyNonPointers<HostType, DeviceType>(
+                    cudap1transptrs_vec[bi],cpuinstvec[bi]->p1transptrs);
+            PhasePointersCopyNonPointers<HostType, DeviceType>(
+                    cudap3transptrs_vec[bi],cpuinstvec[bi]->p3transptrs);
+        }
+        auto end = std::chrono::steady_clock::now();
+        auto elapse_time = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
+#ifdef USE_MPI
+        int initflag;
+        MPI_Initialized(&initflag);
+        if(initflag == 1){
+            int rank;
+            int size;
+            MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+            MPI_Comm_size(MPI_COMM_WORLD,&size);
+            MPI_Barrier(MPI_COMM_WORLD);
+            auto recv_buffer = elapse_time;
+            MPI_Allreduce(&elapse_time, &recv_buffer, 1,
+                          MPI_LONG_LONG, MPI_MAX, MPI_COMM_WORLD);
+            if(rank == 0) cout << "Reading data buffers took " << recv_buffer * 1e-6 << " seconds." << endl;
+        }
+#else
+        cout << "Reading data buffers took " << elapse_time * 1e-6 << " seconds." << endl;
+#endif
+        Phase1Prepare();
+        Phase2Prepare();
+        Phase3Prepare();
+        // transpose
+        Phase1PrepareTranspose();
+        Phase2PrepareTranspose();
+        Phase3PrepareTranspose();
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType, DeviceType>::SetTransposeConjugate(bool transpose, bool conjugate){
+        this->transpose = transpose;
+        this->conjugate = conjugate;
+        for(auto &x : cpuinstvec) x->SetTransposeConjugate(transpose, conjugate);
+    }
+
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::setX(HostType * xvector, size_t xlength){
+        int offset = 0;
+        assert(xlength == config_vec[0].originN * batchsize);
+        for(int i=0; i<cpuinstvec.size(); i++){
+            cpuinstvec[i]->setX(xvector + offset, config_vec[i].originN);
+            offset += config_vec[i].originN;
+        }
+    }
+
+    template<typename HostType, 
typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::TryConjugateXvec() { + for(int bi=0; bi<batchsize; bi++){ + // no transpose logic + cpuinstvec[bi]->TryConjugateXvec(); + CopyDataB2HD((HostType*)cudap1ptrs_vec[bi].x, cpuinstvec[bi]->p1ptrs.x, cpuinstvec[bi]->xmat.Shape()[0]); + } + } + + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::Phase1(){ + hipDeviceSynchronize(); + for(int bi=0; bi < batchsize; bi++){ + for(int i=0; i<config_vec[bi].Ntg; i++){ + if(cudap1ptrs_vec[bi].Ms[i] != 0){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N, + cudap1ptrs_vec[bi].Ms[i], cudap1ptrs_vec[bi].Ks[i], + &alpha, (const DeviceType*)cudap1ptrs_vec[bi].Abp[i], cudap1ptrs_vec[bi].Ms[i], + (const DeviceType*)cudap1ptrs_vec[bi].xbp[i], 1, &beta, + cudap1ptrs_vec[bi].ybp[i], 1); + } + } + } + hipDeviceSynchronize(); + } + + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::Phase1Transpose(){ + hipDeviceSynchronize(); + for(int bi=0; bi < batchsize; bi++){ + for(int i=0; i<config_vec[bi].Ntg; i++){ + if(cudap1transptrs_vec[bi].Ms[i] != 0){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T, + cudap1transptrs_vec[bi].Ks[i], cudap1transptrs_vec[bi].Ms[i], + &alpha, (const DeviceType*)cudap1transptrs_vec[bi].Abp[i], + cudap1transptrs_vec[bi].Ks[i], + (const DeviceType*)cudap1transptrs_vec[bi].xbp[i], 1, &beta, + cudap1transptrs_vec[bi].ybp[i], 1); + } + } + } + hipDeviceSynchronize(); + } + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::Phase1Prepare() { + + for(int bi=0; bi<batchsize; bi++){ + int curbatch = cudap1ptrs_vec[bi].Ms.size(); + GethipHostMemory(&cudap1ptrs_vec[bi].Abp, curbatch); + GethipHostMemory(&cudap1ptrs_vec[bi].xbp, curbatch); + GethipHostMemory(&cudap1ptrs_vec[bi].ybp, curbatch); + } + for(int bi=0; bi<batchsize; bi++){ + GetDeviceMemory(&cudap1ptrs_vec[bi].A, cudap1ptrs_vec[bi].Acnt); + GetDeviceMemory(&cudap1ptrs_vec[bi].x, cudap1ptrs_vec[bi].Xcnt); + GetDeviceMemory(&cudap1ptrs_vec[bi].y, cudap1ptrs_vec[bi].Ycnt); + cudap1ptrs_vec[bi].Abp[0] = cudap1ptrs_vec[bi].A; + cudap1ptrs_vec[bi].xbp[0] = cudap1ptrs_vec[bi].x; + cudap1ptrs_vec[bi].ybp[0] = cudap1ptrs_vec[bi].y; + } + for(int bi=0; bi<batchsize; bi++){ + auto AvMs = cudap1ptrs_vec[bi].Ms; + auto AvNs = cudap1ptrs_vec[bi].Ns; + auto AvKs = cudap1ptrs_vec[bi].Ks; + for(int i=1; i<config_vec[bi].Ntg; i++){ + size_t AvMK = AvMs[i-1] * AvKs[i-1]; + size_t AvKN = AvKs[i-1] * AvNs[i-1]; + size_t AvMN = AvMs[i-1] * AvNs[i-1]; + cudap1ptrs_vec[bi].Abp[i] =cudap1ptrs_vec[bi].Abp[i-1] + AvMK; + cudap1ptrs_vec[bi].xbp[i] = cudap1ptrs_vec[bi].xbp[i-1] + AvKN; + cudap1ptrs_vec[bi].ybp[i] = cudap1ptrs_vec[bi].ybp[i-1] + AvMN; + } + // load phase1 A,x to GPU + CopyDataB2HD((HostType*)cudap1ptrs_vec[bi].A, cpuinstvec[bi]->p1ptrs.A, cpuinstvec[bi]->p1ptrs.Acnt); + CopyDataB2HD((HostType*)cudap1ptrs_vec[bi].x, cpuinstvec[bi]->p1ptrs.x, cpuinstvec[bi]->p1ptrs.Xcnt); + } + + } + + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::Phase1PrepareTranspose() { + for(int bi=0; bi < batchsize; bi++){ + int curbatch = cudap1ptrs_vec[bi].Ms.size(); + GethipHostMemory(&cudap1transptrs_vec[bi].Abp, curbatch); + GethipHostMemory(&cudap1transptrs_vec[bi].xbp, curbatch); + GethipHostMemory(&cudap1transptrs_vec[bi].ybp, curbatch); + } + for(int bi=0; bi<batchsize; bi++){ + cudap1transptrs_vec[bi].A = cudap3ptrs_vec[bi].A; + cudap1transptrs_vec[bi].x = 
cudap1ptrs_vec[bi].x; + GetDeviceMemory(&cudap1transptrs_vec[bi].y, cudap1transptrs_vec[bi].Ycnt); + cudap1transptrs_vec[bi].Abp[0] = cudap3ptrs_vec[bi].A; // use phase 3, U bases + cudap1transptrs_vec[bi].xbp[0] = cudap1ptrs_vec[bi].x; // use phase 1, x + cudap1transptrs_vec[bi].ybp[0] = cudap1transptrs_vec[bi].y; // create a new buffer + } + for(int bi=0; bi<batchsize; bi++){ + for(int i=1; i<cudap1transptrs_vec[bi].Ms.size(); i++){ + size_t AvMK = cudap1transptrs_vec[bi].Ms[i-1] * cudap1transptrs_vec[bi].Ks[i-1]; + size_t AvKN = cudap1transptrs_vec[bi].Ks[i-1] * cudap1transptrs_vec[bi].Ns[i-1]; + size_t AvMN = cudap1transptrs_vec[bi].Ms[i-1] * cudap1transptrs_vec[bi].Ns[i-1]; + cudap1transptrs_vec[bi].Abp[i] = cudap1transptrs_vec[bi].Abp[i-1] + AvMK; + cudap1transptrs_vec[bi].xbp[i] = cudap1transptrs_vec[bi].xbp[i-1] + AvKN; + cudap1transptrs_vec[bi].ybp[i] = cudap1transptrs_vec[bi].ybp[i-1] + AvMN; + } + } + } + + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::Phase2(){ + hipDeviceSynchronize(); + for(int bi=0; bi<batchsize; bi++){ + phase2_nosplit<DeviceType>(cudap1ptrs_vec[bi].y, d_phase2mapping_vec[bi], + cudap3ptrs_vec[bi].x, + config_vec[bi].granksum, streamptr[bi % stream_size]); + } + hipDeviceSynchronize(); + } + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::Phase2Transpose(){ + hipDeviceSynchronize(); + for(int bi=0; bi<batchsize; bi++){ + phase2_nosplit<DeviceType>(cudap1transptrs_vec[bi].y, d_phase2mapping_transpose_vec[bi], + cudap3transptrs_vec[bi].x, config_vec[bi].granksum, + streamptr[bi%stream_size]); + } + hipDeviceSynchronize(); + } + + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::Phase2Prepare(){ + d_phase2mapping_vec = new size_t*[batchsize]; + for(int bi=0; bi<batchsize; bi++){ + GetDeviceMemory(&d_phase2mapping_vec[bi], cpuinstvec[bi]->h_phase2mapping.size()); + CopyDataB2HD(d_phase2mapping_vec[bi], cpuinstvec[bi]->h_phase2mapping.data(), + cpuinstvec[bi]->h_phase2mapping.size()); + } + } + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::Phase2PrepareTranspose(){ + d_phase2mapping_transpose_vec = new size_t*[batchsize]; + for(int bi=0; bi<batchsize; bi++){ + GetDeviceMemory(&d_phase2mapping_transpose_vec[bi], cpuinstvec[bi]->h_phase2mappingTranspose.size()); + CopyDataB2HD(d_phase2mapping_transpose_vec[bi], cpuinstvec[bi]->h_phase2mappingTranspose.data(), + cpuinstvec[bi]->h_phase2mappingTranspose.size()); + } + } + + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::Phase3(){ + hipDeviceSynchronize(); + for(int bi=0; bi<batchsize; bi++){ + for(int i=0; i<config_vec[bi].Mtg; i++){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N, + cudap3ptrs_vec[bi].Ms[i], cudap3ptrs_vec[bi].Ks[i], + &alpha, (const DeviceType*)cudap3ptrs_vec[bi].Abp[i], cudap3ptrs_vec[bi].Ms[i], + (const DeviceType*)cudap3ptrs_vec[bi].xbp[i], 1, &beta,cudap3ptrs_vec[bi].ybp[i], 1); + } + } + hipDeviceSynchronize(); + } + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::Phase3Transpose(){ + hipDeviceSynchronize(); + for(int bi=0; bi<batchsize; bi++){ + for(int i=0; i<config_vec[bi].Mtg; i++){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T, + cudap3transptrs_vec[bi].Ks[i], cudap3transptrs_vec[bi].Ms[i], + &alpha, (const DeviceType*)cudap3transptrs_vec[bi].Abp[i], + cudap3transptrs_vec[bi].Ks[i], + (const 
DeviceType*)cudap3transptrs_vec[bi].xbp[i], 1, + &beta,cudap3transptrs_vec[bi].ybp[i], 1); + } + } + hipDeviceSynchronize(); + } + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::Phase3Prepare() { + for(int bi=0; bi<batchsize; bi++){ + int curbatch = cudap3ptrs_vec[bi].Ms.size(); + GethipHostMemory(&cudap3ptrs_vec[bi].Abp, curbatch); + GethipHostMemory(&cudap3ptrs_vec[bi].xbp, curbatch); + GethipHostMemory(&cudap3ptrs_vec[bi].ybp, curbatch); + } + for(int bi=0; bi<batchsize; bi++){ + GetDeviceMemory(&cudap3ptrs_vec[bi].A, cudap3ptrs_vec[bi].Acnt); + GetDeviceMemory(&cudap3ptrs_vec[bi].x, cudap3ptrs_vec[bi].Xcnt); + GetDeviceMemory(&cudap3ptrs_vec[bi].y, cudap3ptrs_vec[bi].Ycnt); + cudap3ptrs_vec[bi].Abp[0] = cudap3ptrs_vec[bi].A; + cudap3ptrs_vec[bi].xbp[0] = cudap3ptrs_vec[bi].x; + cudap3ptrs_vec[bi].ybp[0] = cudap3ptrs_vec[bi].y; + } + for(int bi=0; bi<batchsize; bi++){ + auto AuMs = cudap3ptrs_vec[bi].Ms; + auto AuNs = cudap3ptrs_vec[bi].Ns; + auto AuKs = cudap3ptrs_vec[bi].Ks; + for(int i=1; i<cpuinstvec[bi]->config.Mtg; i++){ + size_t AuMK = AuMs[i-1] * AuKs[i-1]; + size_t AuKN = AuKs[i-1] * AuNs[i-1]; + size_t AuMN = AuMs[i-1] * AuNs[i-1]; + cudap3ptrs_vec[bi].Abp[i] = cudap3ptrs_vec[bi].Abp[i-1] + AuMK; + cudap3ptrs_vec[bi].xbp[i] = cudap3ptrs_vec[bi].xbp[i-1] + AuKN; + cudap3ptrs_vec[bi].ybp[i] = cudap3ptrs_vec[bi].ybp[i-1] + AuMN; + } + // load phase 3 A to GPU + CopyDataB2HD((HostType*)cudap3ptrs_vec[bi].A, cpuinstvec[bi]->p3ptrs.A, cudap3ptrs_vec[bi].Acnt); + } + } + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::Phase3PrepareTranspose() { + for(int bi=0; bi<batchsize; bi++){ + int curbatch = cudap3transptrs_vec[bi].Ms.size(); + GethipHostMemory(&cudap3transptrs_vec[bi].Abp, curbatch); + GethipHostMemory(&cudap3transptrs_vec[bi].xbp, curbatch); + GethipHostMemory(&cudap3transptrs_vec[bi].ybp, curbatch); + } + for(int bi=0; bi<batchsize; bi++){ + cudap3transptrs_vec[bi].A = cudap1ptrs_vec[bi].A; + cudap3transptrs_vec[bi].x = cudap3ptrs_vec[bi].x; + GetDeviceMemory(&cudap3transptrs_vec[bi].y, cudap3transptrs_vec[bi].Ycnt); + cudap3transptrs_vec[bi].Abp[0] = cudap1ptrs_vec[bi].A; // use phase 1, V bases + cudap3transptrs_vec[bi].xbp[0] = cudap3ptrs_vec[bi].x; // use phase 3, x + cudap3transptrs_vec[bi].ybp[0] = cudap3transptrs_vec[bi].y; // create a new buffer + } + for(int bi=0; bi<batchsize; bi++){ + for(int i=1; i<cudap3transptrs_vec[bi].Ms.size(); i++){ + size_t AvMK = cudap3transptrs_vec[bi].Ms[i-1] * cudap3transptrs_vec[bi].Ks[i-1]; + size_t AvKN = cudap3transptrs_vec[bi].Ks[i-1] * cudap3transptrs_vec[bi].Ns[i-1]; + size_t AvMN = cudap3transptrs_vec[bi].Ms[i-1] * cudap3transptrs_vec[bi].Ns[i-1]; + cudap3transptrs_vec[bi].Abp[i] = cudap3transptrs_vec[bi].Abp[i-1] + AvMK; + cudap3transptrs_vec[bi].xbp[i] = cudap3transptrs_vec[bi].xbp[i-1] + AvKN; + cudap3transptrs_vec[bi].ybp[i] = cudap3transptrs_vec[bi].ybp[i-1] + AvMN; + } + // no need to copy data. 
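+            // note: these transpose pointer sets reuse device data that is already
+            // resident: Abp walks the phase-1 V bases and xbp the phase-3 x buffer,
+            // so only the freshly allocated y buffer is new here.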
+ } + } + + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::MVM_MultiGraph(){ + if(transpose){ + MVM_MultiGraphTranspose(); + }else{ + MVM_MultiGraphNoTranspose(); + } + } + + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::MVM_MultiGraphTranspose(){ + auto & graphCreated = transposemultigraph.graphCreated; + auto & event_start = transposemultigraph.event_start; + auto & events = transposemultigraph.events; + auto & graph = transposemultigraph.graph; + auto & instance = transposemultigraph.instance; + for(int bi=0; bi<batchsize; bi++){ + if(!graphCreated[bi]){ + hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal); + hipEventRecord(event_start[bi], streamptr[0]); + for(int streami=1; streami<stream_size; streami++){ + hipStreamWaitEvent(streamptr[streami], event_start[bi],0); + } + // phase 1 transpose + for(int i=0; i<config_vec[bi].Ntg; i++){ + if(cudap1transptrs_vec[bi].Ms[i] != 0){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T, + cudap1transptrs_vec[bi].Ks[i], cudap1transptrs_vec[bi].Ms[i], + &alpha, (const DeviceType*)cudap1transptrs_vec[bi].Abp[i], + cudap1transptrs_vec[bi].Ks[i], + (const DeviceType*)cudap1transptrs_vec[bi].xbp[i], 1, &beta, + cudap1transptrs_vec[bi].ybp[i], 1); + } + } + for(int streami=1; streami < stream_size; streami++){ + hipEventRecord(events[bi][streami], streamptr[streami]); + } + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[0], events[bi][streami],0); + } + // phase 2 transpose + phase2_nosplit<DeviceType>(cudap1transptrs_vec[bi].y, + d_phase2mapping_transpose_vec[bi], + cudap3transptrs_vec[bi].x, + config_vec[bi].granksum, streamptr[0]); + hipEventRecord(events[bi][0], streamptr[0]); + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[streami], events[bi][0],0); + } + // phase 3 transpose + for(int i=0; i<config_vec[bi].Mtg; i++){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T, + cudap3transptrs_vec[bi].Ks[i], cudap3transptrs_vec[bi].Ms[i], + &alpha, (const DeviceType*)cudap3transptrs_vec[bi].Abp[i], + cudap3transptrs_vec[bi].Ks[i], + (const DeviceType*)cudap3transptrs_vec[bi].xbp[i], + 1, &beta,cudap3transptrs_vec[bi].ybp[i], 1); + } + // final merge + for(int streami=1; streami < stream_size; streami++){ + hipEventRecord(events[bi][stream_size + streami], streamptr[streami]); + } + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[0], events[bi][stream_size + streami],0); + } + hipStreamEndCapture(streamptr[0], &graph[bi]); + hipGraphInstantiate(&instance[bi], graph[bi], nullptr, nullptr, 0); + graphCreated[bi] = true; + } + hipGraphLaunch(instance[bi], streamptr[0]); + hipStreamSynchronize(streamptr[0]); + } + } + + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::MVM_MultiGraphNoTranspose() + { + auto & graphCreated = multigraph.graphCreated; + auto & event_start = multigraph.event_start; + auto & events = multigraph.events; + auto & graph = multigraph.graph; + auto & instance = multigraph.instance; + for(int bi=0; bi<batchsize; bi++){ + if(!graphCreated[bi]){ + hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal); + hipEventRecord(event_start[bi], streamptr[0]); + for(int streami=1; streami<stream_size; streami++){ + hipStreamWaitEvent(streamptr[streami], event_start[bi],0); + } + // phase 1 + for(int i=0; i<config_vec[bi].Ntg; i++){ + if(cudap1ptrs_vec[bi].Ms[i] != 
0){
+                        hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N,
+                                    cudap1ptrs_vec[bi].Ms[i], cudap1ptrs_vec[bi].Ks[i],
+                                    &alpha, (const DeviceType*)cudap1ptrs_vec[bi].Abp[i], cudap1ptrs_vec[bi].Ms[i],
+                                    (const DeviceType*)cudap1ptrs_vec[bi].xbp[i], 1, &beta,
+                                    cudap1ptrs_vec[bi].ybp[i], 1);
+                    }
+                }
+                for(int streami=1; streami < stream_size; streami++){
+                    hipEventRecord(events[bi][streami], streamptr[streami]);
+                }
+                for(int streami=1; streami < stream_size; streami++){
+                    hipStreamWaitEvent(streamptr[0], events[bi][streami],0);
+                }
+                // phase 2
+                phase2_nosplit<DeviceType>(cudap1ptrs_vec[bi].y,
+                                           d_phase2mapping_vec[bi], cudap3ptrs_vec[bi].x,
+                                           config_vec[bi].granksum, streamptr[0]);
+                hipEventRecord(events[bi][0], streamptr[0]);
+                for(int streami=1; streami < stream_size; streami++){
+                    hipStreamWaitEvent(streamptr[streami], events[bi][0],0);
+                }
+                // phase 3
+                for(int i=0; i<config_vec[bi].Mtg; i++){
+                    hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N,
+                                cudap3ptrs_vec[bi].Ms[i], cudap3ptrs_vec[bi].Ks[i],
+                                &alpha, (const DeviceType*)cudap3ptrs_vec[bi].Abp[i], cudap3ptrs_vec[bi].Ms[i],
+                                (const DeviceType*)cudap3ptrs_vec[bi].xbp[i], 1, &beta,cudap3ptrs_vec[bi].ybp[i], 1);
+                }
+                // final merge
+                for(int streami=1; streami < stream_size; streami++){
+                    hipEventRecord(events[bi][stream_size + streami], streamptr[streami]);
+                }
+                for(int streami=1; streami < stream_size; streami++){
+                    hipStreamWaitEvent(streamptr[0], events[bi][stream_size + streami],0);
+                }
+                hipStreamEndCapture(streamptr[0], &graph[bi]);
+                hipGraphInstantiate(&instance[bi],
+                                    graph[bi], nullptr, nullptr, 0);
+                graphCreated[bi] = true;
+            }
+            hipGraphLaunch(instance[bi], streamptr[0]);
+            hipStreamSynchronize(streamptr[0]);
+        }
+    }
+
+
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::MVM_SingleGraph()
+    {
+        if(transpose){
+            MVM_SingleGraphTranspose();
+        }else{
+            MVM_SingleGraphNoTranspose();
+        }
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::MVM_SingleGraphTranspose()
+    {
+        auto & graphCreated = transposesinglegraph.graphCreated;
+        auto & event_start = transposesinglegraph.event_start;
+        auto & events = transposesinglegraph.events;
+        auto & graph = transposesinglegraph.graph;
+        auto & instance = transposesinglegraph.instance;
+        if(!graphCreated){
+            hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal);
+            transposesinglegraph.syncotherstreams(event_start, streamptr, stream_size);
+            for(int bi=0; bi<batchsize; bi++){
+                // phase 1 transpose
+                for(int i=0; i<config_vec[bi].Ntg; i++){
+                    if(cudap1transptrs_vec[bi].Ms[i] != 0){
+                        hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T,
+                                    cudap1transptrs_vec[bi].Ks[i], cudap1transptrs_vec[bi].Ms[i],
+                                    &alpha, (const DeviceType*)cudap1transptrs_vec[bi].Abp[i],
+                                    cudap1transptrs_vec[bi].Ks[i],
+                                    (const DeviceType*)cudap1transptrs_vec[bi].xbp[i], 1, &beta,
+                                    cudap1transptrs_vec[bi].ybp[i], 1);
+                    }
+                }
+            }
+            // phase 1 synchronization
+            transposesinglegraph.syncallstreams(events, streamptr, stream_size);
+            // phase 2 transpose
+            for(int bi=0; bi<batchsize; bi++){
+                phase2_nosplit<DeviceType>(cudap1transptrs_vec[bi].y,
+                                           d_phase2mapping_transpose_vec[bi],
+                                           cudap3transptrs_vec[bi].x,
+                                           config_vec[bi].granksum, streamptr[0]);
+            }
+            // phase 2 synchronization
+            transposesinglegraph.syncallstreams(events+1*stream_size, streamptr, stream_size);
+            for(int bi=0; bi<batchsize; bi++){
+                // phase 3 transpose
+                for(int i=0; i<config_vec[bi].Mtg; i++){
+                    
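+                    // one gemv per tile row, issued through round-robin hipBLAS handles;
+                    // each handle is bound to its own stream, so graph capture records
+                    // the per-row gemvs as concurrent nodes.
+                    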
hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T, + cudap3transptrs_vec[bi].Ks[i], cudap3transptrs_vec[bi].Ms[i], + &alpha, (const DeviceType*)cudap3transptrs_vec[bi].Abp[i], + cudap3transptrs_vec[bi].Ks[i], + (const DeviceType*)cudap3transptrs_vec[bi].xbp[i], + 1, &beta,cudap3transptrs_vec[bi].ybp[i], 1); + } + } + // final merge + transposesinglegraph.syncstream0(events+2*stream_size, streamptr, stream_size); + hipStreamEndCapture(streamptr[0], &graph); + hipGraphInstantiate(&instance, graph, + nullptr, nullptr, 0); + graphCreated = true; + } + hipGraphLaunch(instance, streamptr[0]); + hipStreamSynchronize(streamptr[0]); + } + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::MVM_SingleGraphNoTranspose() + { + auto & graphCreated = singlegraph.graphCreated; + auto & event_start = singlegraph.event_start; + auto & events = singlegraph.events; + auto & graph = singlegraph.graph; + auto & instance = singlegraph.instance; + + if(!graphCreated){ + hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal); + singlegraph.syncotherstreams(event_start, streamptr, stream_size); + // phase 1 + for(int bi=0; bi<batchsize; bi++){ + for(int i=0; i<config_vec[bi].Ntg; i++){ + if(cudap1ptrs_vec[bi].Ms[i] != 0){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N, + cudap1ptrs_vec[bi].Ms[i], cudap1ptrs_vec[bi].Ks[i], + &alpha, (const DeviceType*)cudap1ptrs_vec[bi].Abp[i], cudap1ptrs_vec[bi].Ms[i], + (const DeviceType*)cudap1ptrs_vec[bi].xbp[i], 1, &beta, + cudap1ptrs_vec[bi].ybp[i], 1); + } + } + } + // phase 1 synchronization + singlegraph.syncallstreams(events, streamptr, stream_size); + // phase 2 + for(int bi=0; bi<batchsize; bi++){ + phase2_nosplit<DeviceType>(cudap1ptrs_vec[bi].y, + d_phase2mapping_vec[bi], cudap3ptrs_vec[bi].x, + config_vec[bi].granksum, streamptr[bi%stream_size]); + } + // phase 2 synchronization + singlegraph.syncallstreams(events+1*stream_size, streamptr, stream_size); + // phase 3 + for(int bi=0; bi<batchsize; bi++){ + for(int i=0; i<config_vec[bi].Mtg; i++){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N, + cudap3ptrs_vec[bi].Ms[i], cudap3ptrs_vec[bi].Ks[i], + &alpha, (const DeviceType*)cudap3ptrs_vec[bi].Abp[i], cudap3ptrs_vec[bi].Ms[i], + (const DeviceType*)cudap3ptrs_vec[bi].xbp[i], 1, + &beta,cudap3ptrs_vec[bi].ybp[i], 1); + } + } + // final merge + singlegraph.syncstream0(events+2*stream_size, streamptr, stream_size); + hipStreamEndCapture(streamptr[0], &graph); + hipGraphInstantiate(&instance, graph, + nullptr, nullptr, 0); + graphCreated = true; + } + hipGraphLaunch(instance, streamptr[0]); + hipStreamSynchronize(streamptr[0]); + } + + + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::TryConjugateResults() { +// if(!conjugate) return; + if(transpose){ + for(int bi=0; bi<config_vec.size(); bi++){ + if(conjugate) ConjugateDriver<DeviceType>(cudap3transptrs_vec[bi].y,config_vec[bi].originN, streamptr[0]); + CopyDataB2HD(cpuinstvec[bi]->p3transptrs.y, (HostType*)cudap3transptrs_vec[bi].y,cpuinstvec[bi]->config.originM); + } + }else{ + for(int bi=0; bi<config_vec.size(); bi++){ + if(conjugate) ConjugateDriver<DeviceType>(cudap3ptrs_vec[bi].y,config_vec[bi].originM,streamptr[0]); + CopyDataB2HD(cpuinstvec[bi]->p3ptrs.y, (HostType*)cudap3ptrs_vec[bi].y,cpuinstvec[bi]->config.originM); + } + } + } + + + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType, DeviceType>::CopyBackResults() + { + size_t offset = 0, origin = 0; + for(int 
bi=0; bi<batchsize; bi++){ + // use cpu pointers to send output + if(transpose){ + CopyDataB2HD(cpuinstvec[bi]->p1transptrs.y, (HostType*)cudap1transptrs_vec[bi].y, cpuinstvec[bi]->config.granksum); + CopyDataB2HD(cpuinstvec[bi]->p3transptrs.x, (HostType*)cudap3transptrs_vec[bi].x, cpuinstvec[bi]->config.granksum); + CopyDataB2HD(cpuinstvec[bi]->p3transptrs.y, (HostType*)cudap3transptrs_vec[bi].y, cpuinstvec[bi]->config.originM); + origin = cpuinstvec[bi]->config.originM; +// memcpy(cpuinstvec[bi]->p3transptrs.y, &alpha, sizeof(HostType)); + memcpy(finalresults.data() + offset,cpuinstvec[bi]->p3transptrs.y, sizeof(HostType) * origin); + offset += cpuinstvec[bi]->config.originM; + }else{ + CopyDataB2HD(cpuinstvec[bi]->p1ptrs.y, (HostType*)cudap1ptrs_vec[bi].y, cpuinstvec[bi]->config.granksum); + CopyDataB2HD(cpuinstvec[bi]->p3ptrs.x, (HostType*)cudap3ptrs_vec[bi].x, cpuinstvec[bi]->config.granksum); + CopyDataB2HD(cpuinstvec[bi]->p3ptrs.y, (HostType*)cudap3ptrs_vec[bi].y, cpuinstvec[bi]->config.originM); + origin = cpuinstvec[bi]->config.originM; + memcpy(finalresults.data() + offset,cpuinstvec[bi]->p3ptrs.y, sizeof(HostType) * origin); + offset += cpuinstvec[bi]->config.originM; + } + cpuinstvec[bi]->CopyToFinalresults(); + } + } + + template<typename HostType, typename DeviceType> + void BatchTlrmvmhip<HostType,DeviceType>::MemoryFree(){ + for(int bi=0; bi<batchsize; bi++){ + cpuinstvec[bi]->MemoryFree(); + FreehipHostMemory(cudap1ptrs_vec[bi].Abp); + FreehipHostMemory(cudap1ptrs_vec[bi].xbp); + FreehipHostMemory(cudap1ptrs_vec[bi].ybp); + FreeDeviceMemory(cudap1ptrs_vec[bi].A); + FreeDeviceMemory(cudap1ptrs_vec[bi].x); + FreeDeviceMemory(cudap1ptrs_vec[bi].y); + + FreehipHostMemory(cudap3ptrs_vec[bi].Abp); + FreehipHostMemory(cudap3ptrs_vec[bi].xbp); + FreehipHostMemory(cudap3ptrs_vec[bi].ybp); + FreeDeviceMemory(cudap3ptrs_vec[bi].A); + FreeDeviceMemory(cudap3ptrs_vec[bi].x); + FreeDeviceMemory(cudap3ptrs_vec[bi].y); + + FreehipHostMemory(cudap1transptrs_vec[bi].Abp); + FreehipHostMemory(cudap1transptrs_vec[bi].xbp); + FreehipHostMemory(cudap1transptrs_vec[bi].ybp); + FreeDeviceMemory(cudap1transptrs_vec[bi].y); + + FreehipHostMemory(cudap3transptrs_vec[bi].Abp); + FreehipHostMemory(cudap3transptrs_vec[bi].xbp); + FreehipHostMemory(cudap3transptrs_vec[bi].ybp); + FreeDeviceMemory(cudap3transptrs_vec[bi].y); + } + } + + template class BatchTlrmvmhip<float, float>; + template class BatchTlrmvmhip<double, double>; + template class BatchTlrmvmhip<complex<float>, hipComplex>; + template class BatchTlrmvmhip<complex<double>, hipDoubleComplex>; + + +} \ No newline at end of file diff --git a/src/tlrmvm/hip/BatchTlrmvmhip.hpp b/src/tlrmvm/hip/BatchTlrmvmhip.hpp new file mode 100644 index 0000000000000000000000000000000000000000..bb2bd281cbcb68bf3c1b5bc57143557008df2a67 --- /dev/null +++ b/src/tlrmvm/hip/BatchTlrmvmhip.hpp @@ -0,0 +1,84 @@ +#pragma once + +#include <vector> +using std::vector; + + +#include "../cpu/TlrmvmCPU.hpp" +#include "Tlrmvmhip.hpp" +#include "tlrmvmhiputil.hpp" +#include <memory> + +#ifdef USE_MPI +#include <mpi.h> +#endif + +namespace hiptlrmvm +{ + template<typename HostType, typename DeviceType> + class BatchTlrmvmhip + { + public: + explicit BatchTlrmvmhip(vector<TlrmvmConfig> tlrmvmconfigvec); + BatchTlrmvmhip(); + void StreamInit(int streamsize); + void StreamDestroy(); + void MemoryInit(); + void MemoryFree(); + void Phase1(); + void Phase1Transpose(); + void Phase1Prepare(); + void Phase1PrepareTranspose(); + void Phase2(); + void Phase2Transpose(); + void 
Phase2Prepare();
+        void Phase2PrepareTranspose();
+        void Phase3();
+        void Phase3Transpose();
+        void Phase3Prepare();
+        void Phase3PrepareTranspose();
+        void MVM_SingleGraph();
+        void MVM_SingleGraphTranspose();
+        void MVM_SingleGraphNoTranspose();
+        void MVM_MultiGraph();
+        void MVM_MultiGraphTranspose();
+        void MVM_MultiGraphNoTranspose();
+
+        // separate 2 functions.
+        void SetTransposeConjugate(bool transpose, bool conjugate);
+        void setX(HostType * xvector, size_t xlength);
+        void TryConjugateXvec();
+        void TryConjugateResults();
+        void CopyBackResults();
+
+        bool transpose;
+        bool conjugate;
+
+        int batchsize;
+        // cpu instance
+        vector<TlrmvmConfig> config_vec;
+        vector<std::shared_ptr<TlrmvmCPU<HostType>>> cpuinstvec;
+
+        // GPU resources
+        hipStream_t * streamptr;
+        hipblasHandle_t * cublashandleptr;
+        int stream_size;
+
+        MultiGraph multigraph;
+        MultiGraph transposemultigraph;
+        SingleGraph singlegraph;
+        SingleGraph transposesinglegraph;
+
+        DeviceType alpha;
+        DeviceType beta;
+        // gpu pointers
+        vector<HIPPhasePointers<DeviceType>> cudap1ptrs_vec;
+        vector<HIPPhasePointers<DeviceType>> cudap1transptrs_vec;
+        size_t * *d_phase2mapping_vec;
+        size_t * *d_phase2mapping_transpose_vec;
+        vector<HIPPhasePointers<DeviceType>> cudap3ptrs_vec;
+        vector<HIPPhasePointers<DeviceType>> cudap3transptrs_vec;
+        vector<HostType> finalresults;
+    };
+}
+
diff --git a/src/tlrmvm/hip/Tlrmvmhip.cpp b/src/tlrmvm/hip/Tlrmvmhip.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..12320c1a449863c87de1a4126a0b1fb5a1a2ab45
--- /dev/null
+++ b/src/tlrmvm/hip/Tlrmvmhip.cpp
@@ -0,0 +1,607 @@
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hipblas.h>
+
+#include "../../common/Common.hpp"
+#include "../../common/AppUtil.hpp"
+#include "../cpu/TlrmvmCPU.hpp"
+#include "Tlrmvmhip.hpp"
+#include "hipkernel.cuh"
+
+namespace hiptlrmvm
+{
+
+    template<typename T>
+    HIPPhasePointers<T>::HIPPhasePointers(){}
+
+    template struct HIPPhasePointers<float>;
+    template struct HIPPhasePointers<double>;
+    template struct HIPPhasePointers<hipComplex>;
+    template struct HIPPhasePointers<hipDoubleComplex>;
+
+    template<typename SrcType, typename DestType>
+    void PhasePointersCopyNonPointers(HIPPhasePointers<DestType> &dest, const PhasePointers<SrcType> &src){
+        dest.Acnt = src.Acnt;
+        dest.Xcnt = src.Xcnt;
+        dest.Ycnt = src.Ycnt;
+        dest.Ms = src.Ms;
+        dest.Ks = src.Ks;
+        dest.Ns = src.Ns;
+    }
+
+    template void PhasePointersCopyNonPointers<float,float>(HIPPhasePointers<float> &,
+            const PhasePointers<float>&);
+    template void PhasePointersCopyNonPointers<double,double>(HIPPhasePointers<double> &,
+            const PhasePointers<double>&);
+    template void PhasePointersCopyNonPointers<complex<float>,hipComplex>
+            (HIPPhasePointers<hipComplex> &, const PhasePointers<complex<float>>&);
+    template void PhasePointersCopyNonPointers<complex<double>,hipDoubleComplex>
+            (HIPPhasePointers<hipDoubleComplex> &, const PhasePointers<complex<double>>&);
+
+    template<typename HostType, typename DeviceType>
+    Tlrmvmhip<HostType, DeviceType>::Tlrmvmhip() {}
+
+    template<typename HostType, typename DeviceType>
+    Tlrmvmhip<HostType, DeviceType>::Tlrmvmhip(TlrmvmConfig tlrmvmconfig)
+    :config(tlrmvmconfig)
+    {
+        transpose = false;
+        conjugate = false;
+        init_alpha_beta(alpha, beta);
+        tlrmvmcpu = std::make_shared<TlrmvmCPU<HostType>>(tlrmvmconfig);
+    }
+
+    template<typename HostType, typename DeviceType>
+    void Tlrmvmhip<HostType, DeviceType>::UpdateConfig(TlrmvmConfig &tlrmvmconfig)
+    {
+// transpose = false;
+// conjugate = false; +// init_alpha_beta(alpha, beta); +// tlrmvmcpu->UpdateConfig(tlrmvmconfig); + cout << "UpdateConfig not implemented." << endl; + exit(0); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::setX(HostType * xvector, size_t xlength){ + tlrmvmcpu->setX(xvector, xlength); + tlrmvmcpu->TryConjugateXvec(); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::TryConjugateXvec() { + // no transpose logic + tlrmvmcpu->TryConjugateXvec(); + CopyDataB2HD((HostType*)this->cudap1ptrs.x, tlrmvmcpu->p1ptrs.x, tlrmvmcpu->xmat.Shape()[0]); + } + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::TryConjugateResults() { + if(!conjugate) return; + if(transpose){ + ConjugateDriver<DeviceType>(cudap3transptrs.y, config.originN, streamptr[0]); + CopyDataB2HD(tlrmvmcpu->p3transptrs.y, (HostType*)cudap3transptrs.y, tlrmvmcpu->config.originM); + }else{ + ConjugateDriver<DeviceType>(cudap3ptrs.y, config.originM, streamptr[0]); + CopyDataB2HD(tlrmvmcpu->p3ptrs.y, (HostType*)cudap3ptrs.y, tlrmvmcpu->config.originM); + } + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::StreamInit(int streamsize){ + this->stream_size = streamsize; + streamptr = new hipStream_t[streamsize]; + cublashandleptr = new hipblasHandle_t[streamsize]; + for(int i=0; i<streamsize; i++) + hipStreamCreateWithFlags(&streamptr[i], hipStreamNonBlocking); + for(int i=0; i<streamsize; i++) + hipblasCreate(&cublashandleptr[i]); + for(int i=0; i<streamsize; i++) + hipblasSetStream(cublashandleptr[i], streamptr[i]); + HIPCHECK(hipEventCreate(&event_start)); + HIPCHECK(hipEventCreate(&event_phase2finish)); + events = new hipEvent_t[2*streamsize]; + for(int i=0; i<2*streamsize; i++) HIPCHECK(hipEventCreate(&events[i])); + // graph + graphCreated = false; + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::StreamDestroy(){ + for(int i=0; i<stream_size; i++) hipblasDestroy(cublashandleptr[i]); + for(int i=0; i<stream_size; i++) hipStreamDestroy(streamptr[i]); + delete[] cublashandleptr; + delete[] streamptr; + HIPCHECK(hipEventDestroy(event_start)); + HIPCHECK(hipEventDestroy(event_phase2finish)); + for(int i=0; i<2*stream_size; i++) HIPCHECK(hipEventDestroy(events[i])); + delete[] events; + // graph + if(graphCreated){ + hipGraphExecDestroy(instance); + hipGraphDestroy(graph); + } + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::MemoryInit(){ + tlrmvmcpu->MemoryInit(); + PhasePointersCopyNonPointers<HostType, DeviceType>(cudap1ptrs, tlrmvmcpu->p1ptrs); + PhasePointersCopyNonPointers<HostType, DeviceType>(cudap3ptrs, tlrmvmcpu->p3ptrs); + PhasePointersCopyNonPointers<HostType, DeviceType>(cudap1transptrs, tlrmvmcpu->p1transptrs); + PhasePointersCopyNonPointers<HostType, DeviceType>(cudap3transptrs, tlrmvmcpu->p3transptrs); + Phase1GetMembuffer(); + AllocatePhase1Buffer(); + Phase1CopyData(); + Phase2Prepare(); + Phase3GetMembuffer(); + AllocatePhase3Buffer(); + Phase3CopyData(); + // transpose + Phase1GetMembufferTranspose(); + AllocatePhase1BufferTranspose(); + Phase1CopyDataTranspose(); + Phase2PrepareTranspose(); + Phase3GetMembufferTranspose(); + AllocatePhase3BufferTranspose(); + Phase3CopyDataTranspose(); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::MemoryFree(){ + tlrmvmcpu->MemoryFree(); + 
FreehipHostMemory(cudap1ptrs.Abp); + FreehipHostMemory(cudap1ptrs.xbp); + FreehipHostMemory(cudap1ptrs.ybp); + FreeDeviceMemory(cudap1ptrs.A); + FreeDeviceMemory(cudap1ptrs.x); + FreeDeviceMemory(cudap1ptrs.y); + + FreehipHostMemory(cudap3ptrs.Abp); + FreehipHostMemory(cudap3ptrs.xbp); + FreehipHostMemory(cudap3ptrs.ybp); + FreeDeviceMemory(cudap3ptrs.A); + FreeDeviceMemory(cudap3ptrs.x); + FreeDeviceMemory(cudap3ptrs.y); + + FreehipHostMemory(cudap1transptrs.Abp); + FreehipHostMemory(cudap1transptrs.xbp); + FreehipHostMemory(cudap1transptrs.ybp); + FreeDeviceMemory(cudap1transptrs.y); + + FreehipHostMemory(cudap3transptrs.Abp); + FreehipHostMemory(cudap3transptrs.xbp); + FreehipHostMemory(cudap3transptrs.ybp); + FreeDeviceMemory(cudap3transptrs.y); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase1(){ + hipDeviceSynchronize(); + for(int i=0; i<config.Ntg; i++){ + if(cudap1ptrs.Ms[i] != 0){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N, + cudap1ptrs.Ms[i], cudap1ptrs.Ks[i], + &alpha, (const DeviceType*)cudap1ptrs.Abp[i], cudap1ptrs.Ms[i], + (const DeviceType*)cudap1ptrs.xbp[i], 1, &beta, + cudap1ptrs.ybp[i], 1); + } + } + hipDeviceSynchronize(); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase1Transpose(){ + hipDeviceSynchronize(); + for(int i=0; i<config.Ntg; i++){ + if(cudap1transptrs.Ms[i] != 0){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T, + cudap1transptrs.Ks[i], cudap1transptrs.Ms[i], + &alpha, (const DeviceType*)cudap1transptrs.Abp[i], + cudap1transptrs.Ks[i], + (const DeviceType*)cudap1transptrs.xbp[i], 1, &beta, + cudap1transptrs.ybp[i], 1); + } + } + hipDeviceSynchronize(); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase1GetMembuffer(){ + int batchsize = cudap1ptrs.Ms.size(); + GethipHostMemory(&cudap1ptrs.Abp, batchsize); + GethipHostMemory(&cudap1ptrs.xbp, batchsize); + GethipHostMemory(&cudap1ptrs.ybp, batchsize); + } + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase1GetMembufferTranspose() + { + int batchsize = cudap1ptrs.Ms.size(); + GethipHostMemory(&cudap1transptrs.Abp, batchsize); + GethipHostMemory(&cudap1transptrs.xbp, batchsize); + GethipHostMemory(&cudap1transptrs.ybp, batchsize); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::AllocatePhase1Buffer(){ + GetDeviceMemory(&cudap1ptrs.A, cudap1ptrs.Acnt); + GetDeviceMemory(&cudap1ptrs.x, cudap1ptrs.Xcnt); + GetDeviceMemory(&cudap1ptrs.y, cudap1ptrs.Ycnt); + cudap1ptrs.Abp[0] = cudap1ptrs.A; + cudap1ptrs.xbp[0] = cudap1ptrs.x; + cudap1ptrs.ybp[0] = cudap1ptrs.y; + } + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::AllocatePhase1BufferTranspose(){ + cudap1transptrs.A = cudap3ptrs.A; + cudap1transptrs.x = cudap1ptrs.x; + GetDeviceMemory(&cudap1transptrs.y, cudap1transptrs.Ycnt); + cudap1transptrs.Abp[0] = cudap3ptrs.A; // use phase 3, U bases + cudap1transptrs.xbp[0] = cudap1ptrs.x; // use phase 1, x + cudap1transptrs.ybp[0] = cudap1transptrs.y; // create a new buffer + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase1CopyData(){ + auto AvMs = cudap1ptrs.Ms; + auto AvNs = cudap1ptrs.Ns; + auto AvKs = cudap1ptrs.Ks; + for(int i=1; i<config.Ntg; i++){ + size_t AvMK = AvMs[i-1] * AvKs[i-1]; + size_t AvKN = AvKs[i-1] * AvNs[i-1]; + size_t AvMN = AvMs[i-1] * 
AvNs[i-1]; + cudap1ptrs.Abp[i] = cudap1ptrs.Abp[i-1] + AvMK; + cudap1ptrs.xbp[i] = cudap1ptrs.xbp[i-1] + AvKN; + cudap1ptrs.ybp[i] = cudap1ptrs.ybp[i-1] + AvMN; + } + // load phase1 A,x to GPU + CopyDataB2HD((HostType*)cudap1ptrs.A, tlrmvmcpu->p1ptrs.A, tlrmvmcpu->p1ptrs.Acnt); + CopyDataB2HD((HostType*)cudap1ptrs.x, tlrmvmcpu->p1ptrs.x, tlrmvmcpu->p1ptrs.Xcnt); + } + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase1CopyDataTranspose(){ + for(int i=1; i<cudap1transptrs.Ms.size(); i++){ + size_t AvMK = cudap1transptrs.Ms[i-1] * cudap1transptrs.Ks[i-1]; + size_t AvKN = cudap1transptrs.Ks[i-1] * cudap1transptrs.Ns[i-1]; + size_t AvMN = cudap1transptrs.Ms[i-1] * cudap1transptrs.Ns[i-1]; + cudap1transptrs.Abp[i] = cudap1transptrs.Abp[i-1] + AvMK; + cudap1transptrs.xbp[i] = cudap1transptrs.xbp[i-1] + AvKN; + cudap1transptrs.ybp[i] = cudap1transptrs.ybp[i-1] + AvMN; + } + // no need to copy data; it was already copied in the non-transpose pass. + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase2(){ + hipDeviceSynchronize(); + phase2_nosplit<DeviceType>(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x, + config.granksum, streamptr[0]); + hipDeviceSynchronize(); + } + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase2Transpose(){ + hipDeviceSynchronize(); + phase2_nosplit<DeviceType>(cudap1transptrs.y, d_phase2mapping_transpose, + cudap3transptrs.x, config.granksum, streamptr[0]); + hipDeviceSynchronize(); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase2Prepare(){ + GetDeviceMemory(&d_phase2mapping, tlrmvmcpu->h_phase2mapping.size()); + CopyDataB2HD(d_phase2mapping, tlrmvmcpu->h_phase2mapping.data(),tlrmvmcpu->h_phase2mapping.size()); + } + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase2PrepareTranspose(){ + GetDeviceMemory(&d_phase2mapping_transpose, tlrmvmcpu->h_phase2mappingTranspose.size()); + CopyDataB2HD(d_phase2mapping_transpose, tlrmvmcpu->h_phase2mappingTranspose.data(), + tlrmvmcpu->h_phase2mappingTranspose.size()); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase3(){ + hipDeviceSynchronize(); + for(int i=0; i<config.Mtg; i++){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N, + cudap3ptrs.Ms[i], cudap3ptrs.Ks[i], + &alpha, (const DeviceType*)cudap3ptrs.Abp[i], cudap3ptrs.Ms[i], + (const DeviceType*)cudap3ptrs.xbp[i], 1, &beta,cudap3ptrs.ybp[i], 1); + } + hipDeviceSynchronize(); + } + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase3Transpose(){ + hipDeviceSynchronize(); + for(int i=0; i<config.Mtg; i++){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T, + cudap3transptrs.Ks[i], cudap3transptrs.Ms[i], + &alpha, (const DeviceType*)cudap3transptrs.Abp[i], + cudap3transptrs.Ks[i], + (const DeviceType*)cudap3transptrs.xbp[i], 1, &beta,cudap3transptrs.ybp[i], 1); + } + hipDeviceSynchronize(); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase3GetMembuffer(){ + int batchsize = cudap3ptrs.Ms.size(); + GethipHostMemory(&cudap3ptrs.Abp, batchsize); + GethipHostMemory(&cudap3ptrs.xbp, batchsize); + GethipHostMemory(&cudap3ptrs.ybp, batchsize); + } + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase3GetMembufferTranspose(){ + int batchsize = cudap3transptrs.Ms.size(); + 
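// Phase 2 of this pipeline is a pure scatter with no arithmetic: each entry of
// the phase-1 output yv moves to the phase-3 input slot selected by the
// precomputed mapping. The device kernel is phase2_nosplit (hipkernel.cpp,
// later in this patch); a host-side reference of what it computes, as a sketch:
//     for (size_t i = 0; i < granksum; ++i)
//         yu[phase2mapping[i]] = yv[i];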
GethipHostMemory(&cudap3transptrs.Abp, batchsize); + GethipHostMemory(&cudap3transptrs.xbp, batchsize); + GethipHostMemory(&cudap3transptrs.ybp, batchsize); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::AllocatePhase3Buffer(){ + GetDeviceMemory(&cudap3ptrs.A, cudap3ptrs.Acnt); + GetDeviceMemory(&cudap3ptrs.x, cudap3ptrs.Xcnt); + GetDeviceMemory(&cudap3ptrs.y, cudap3ptrs.Ycnt); + cudap3ptrs.Abp[0] = cudap3ptrs.A; + cudap3ptrs.xbp[0] = cudap3ptrs.x; + cudap3ptrs.ybp[0] = cudap3ptrs.y; + } + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::AllocatePhase3BufferTranspose(){ + cudap3transptrs.A = cudap1ptrs.A; + cudap3transptrs.x = cudap3ptrs.x; + GetDeviceMemory(&cudap3transptrs.y, cudap3transptrs.Ycnt); + cudap3transptrs.Abp[0] = cudap1ptrs.A; // use phase 1, V bases + cudap3transptrs.xbp[0] = cudap3ptrs.x; // use phase 3, x + cudap3transptrs.ybp[0] = cudap3transptrs.y; // create a new buffer + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase3CopyData(){ + auto AuMs = cudap3ptrs.Ms; + auto AuNs = cudap3ptrs.Ns; + auto AuKs = cudap3ptrs.Ks; + for(int i=1; i<tlrmvmcpu->config.Mtg; i++){ + size_t AuMK = AuMs[i-1] * AuKs[i-1]; + size_t AuKN = AuKs[i-1] * AuNs[i-1]; + size_t AuMN = AuMs[i-1] * AuNs[i-1]; + cudap3ptrs.Abp[i] = cudap3ptrs.Abp[i-1] + AuMK; + cudap3ptrs.xbp[i] = cudap3ptrs.xbp[i-1] + AuKN; + cudap3ptrs.ybp[i] = cudap3ptrs.ybp[i-1] + AuMN; + } + // load phase 3 A to GPU + CopyDataB2HD((HostType*)cudap3ptrs.A, tlrmvmcpu->p3ptrs.A, cudap3ptrs.Acnt); + } + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::Phase3CopyDataTranspose(){ + for(int i=1; i<cudap3transptrs.Ms.size(); i++){ + size_t AvMK = cudap3transptrs.Ms[i-1] * cudap3transptrs.Ks[i-1]; + size_t AvKN = cudap3transptrs.Ks[i-1] * cudap3transptrs.Ns[i-1]; + size_t AvMN = cudap3transptrs.Ms[i-1] * cudap3transptrs.Ns[i-1]; + cudap3transptrs.Abp[i] = cudap3transptrs.Abp[i-1] + AvMK; + cudap3transptrs.xbp[i] = cudap3transptrs.xbp[i-1] + AvKN; + cudap3transptrs.ybp[i] = cudap3transptrs.ybp[i-1] + AvMN; + } + // no need to copy data. 
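The *CopyData routines above all follow the same prefix-offset scheme: each phase owns one contiguous device allocation per operand, and Abp/xbp/ybp are per-tile views carved out of it by running offsets. A minimal self-contained sketch of that slicing (the helper name "slicebuffer" is illustrative, not part of this codebase):

    // Sketch: carve per-tile views out of one contiguous buffer by prefix offsets.
    #include <cstddef>
    #include <vector>
    template<typename T>
    std::vector<T*> slicebuffer(T* base, const std::vector<std::size_t>& elems){
        std::vector<T*> views(elems.size());
        std::size_t offset = 0;
        for(std::size_t i = 0; i < elems.size(); ++i){
            views[i] = base + offset;   // view i starts where view i-1 ends
            offset += elems[i];
        }
        return views;
    }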
+ } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType, DeviceType>::MVMGraphTranspose() { + if(!graphCreated){ + hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal); + hipEventRecord(event_start, streamptr[0]); + for(int streami=1; streami<stream_size; streami++){ + hipStreamWaitEvent(streamptr[streami], event_start,0); + } + // phase 1 transpose + for(int i=0; i<config.Ntg; i++){ + if(cudap1transptrs.Ms[i] != 0){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T, + cudap1transptrs.Ks[i], cudap1transptrs.Ms[i], + &alpha, (const DeviceType*)cudap1transptrs.Abp[i], + cudap1transptrs.Ks[i], + (const DeviceType*)cudap1transptrs.xbp[i], 1, &beta, + cudap1transptrs.ybp[i], 1); + } + } + for(int streami=1; streami < stream_size; streami++){ + hipEventRecord(events[streami], streamptr[streami]); + } + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[0], events[streami],0); + } + // phase 2 transpose + phase2_nosplit<DeviceType>(cudap1transptrs.y, d_phase2mapping_transpose, + cudap3transptrs.x, config.granksum, streamptr[0]); + hipEventRecord(events[0], streamptr[0]); + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[streami], events[0],0); + } + // phase 3 transpose + for(int i=0; i<config.Mtg; i++){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T, + cudap3transptrs.Ks[i], cudap3transptrs.Ms[i], + &alpha, (const DeviceType*)cudap3transptrs.Abp[i], + cudap3transptrs.Ks[i], + (const DeviceType*)cudap3transptrs.xbp[i], 1, &beta,cudap3transptrs.ybp[i], 1); + } + // final merge + for(int streami=1; streami < stream_size; streami++){ + hipEventRecord(events[stream_size + streami], streamptr[streami]); + } + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[0], events[stream_size + streami],0); + } + hipStreamEndCapture(streamptr[0], &graph); + hipGraphInstantiate(&instance, graph, nullptr, nullptr, 0); + graphCreated = true; + } + hipGraphLaunch(instance, streamptr[0]); + hipStreamSynchronize(streamptr[0]); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType, DeviceType>::MVMGraphNoTranspose() { + if(!graphCreated){ + hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal); + hipEventRecord(event_start, streamptr[0]); + for(int streami=1; streami<stream_size; streami++){ + hipStreamWaitEvent(streamptr[streami], event_start,0); + } + // phase 1 + for(int i=0; i<config.Ntg; i++){ + if(cudap1ptrs.Ms[i] != 0){ + hipblasgemm(cublashandleptr[i%stream_size],HIPBLAS_OP_N, HIPBLAS_OP_N, + cudap1ptrs.Ms[i], cudap1ptrs.Ns[i], cudap1ptrs.Ks[i], + &alpha, (const DeviceType*)cudap1ptrs.Abp[i], cudap1ptrs.Ms[i], + (const DeviceType*)cudap1ptrs.xbp[i], cudap1ptrs.Ks[i], + &beta, cudap1ptrs.ybp[i], cudap1ptrs.Ms[i]); +// hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N, +// cudap1ptrs.Ms[i], cudap1ptrs.Ks[i], +// &alpha, (const DeviceType*)cudap1ptrs.Abp[i], cudap1ptrs.Ms[i], +// (const DeviceType*)cudap1ptrs.xbp[i], 1, &beta, +// cudap1ptrs.ybp[i], 1); + } + } + for(int streami=1; streami < stream_size; streami++){ + hipEventRecord(events[streami], streamptr[streami]); + } + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[0], events[streami],0); + } + // phase 2 + phase2_nosplit<DeviceType>(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x, + config.granksum, streamptr[0]); + hipEventRecord(events[0], streamptr[0]); + for(int streami=1; streami < stream_size; streami++){ + 
hipStreamWaitEvent(streamptr[streami], events[0],0); + } + // phase 3 + for(int i=0; i<config.Mtg; i++){ + hipblasgemm(cublashandleptr[i%stream_size],HIPBLAS_OP_N, HIPBLAS_OP_N, + cudap3ptrs.Ms[i], cudap3ptrs.Ns[i], cudap3ptrs.Ks[i], + &alpha, (const DeviceType*)cudap3ptrs.Abp[i], cudap3ptrs.Ms[i], + (const DeviceType*)cudap3ptrs.xbp[i], cudap3ptrs.Ks[i], + &beta,cudap3ptrs.ybp[i], cudap3ptrs.Ms[i]); +// hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N, +// cudap3ptrs.Ms[i], cudap3ptrs.Ks[i], +// &alpha, (const DeviceType*)cudap3ptrs.Abp[i], cudap3ptrs.Ms[i], +// (const DeviceType*)cudap3ptrs.xbp[i], 1, &beta,cudap3ptrs.ybp[i], 1); + } + // final merge + for(int streami=1; streami < stream_size; streami++){ + hipEventRecord(events[stream_size + streami], streamptr[streami]); + } + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[0], events[stream_size + streami],0); + } + hipStreamEndCapture(streamptr[0], &graph); + hipGraphInstantiate(&instance, graph, nullptr, nullptr, 0); + graphCreated = true; + } + hipGraphLaunch(instance, streamptr[0]); + hipStreamSynchronize(streamptr[0]); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::MVMGraph(){ + if(transpose){ + MVMGraphTranspose(); + }else{ + MVMGraphNoTranspose(); + } + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::MVMTranspose() + { + hipDeviceSynchronize(); + for(int i=0; i<config.Ntg; i++){ + if(cudap1transptrs.Ms[i] != 0){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T, + cudap1transptrs.Ks[i], cudap1transptrs.Ms[i], + &alpha, (const DeviceType*)cudap1transptrs.Abp[i], + cudap1transptrs.Ks[i], + (const DeviceType*)cudap1transptrs.xbp[i], 1, &beta, + cudap1transptrs.ybp[i], 1); + } + } + hipDeviceSynchronize(); + phase2_nosplit<DeviceType>(cudap1transptrs.y, d_phase2mapping_transpose, + cudap3transptrs.x, config.granksum, streamptr[0]); + hipDeviceSynchronize(); + for(int i=0; i<config.Mtg; i++){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T, + cudap3transptrs.Ks[i], cudap3transptrs.Ms[i], + &alpha, (const DeviceType*)cudap3transptrs.Abp[i], + cudap3transptrs.Ks[i], + (const DeviceType*)cudap3transptrs.xbp[i], 1, &beta,cudap3transptrs.ybp[i], 1); + } + hipDeviceSynchronize(); + } + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType,DeviceType>::MVMNoTranspose() + { + hipDeviceSynchronize(); + for(int i=0; i<config.Ntg; i++){ + if(cudap1ptrs.Ms[i] != 0){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N, + cudap1ptrs.Ms[i], cudap1ptrs.Ks[i], + &alpha, (const DeviceType*)cudap1ptrs.Abp[i], cudap1ptrs.Ms[i], + (const DeviceType*)cudap1ptrs.xbp[i], 1, &beta, + cudap1ptrs.ybp[i], 1); + } + } + hipDeviceSynchronize(); + phase2_nosplit<DeviceType>(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x, + config.granksum, streamptr[0]); + hipDeviceSynchronize(); + for(int i=0; i<config.Mtg; i++){ + hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N, + cudap3ptrs.Ms[i], cudap3ptrs.Ks[i], + &alpha, (const DeviceType*)cudap3ptrs.Abp[i], cudap3ptrs.Ms[i], + (const DeviceType*)cudap3ptrs.xbp[i], 1, &beta,cudap3ptrs.ybp[i], 1); + } + hipDeviceSynchronize(); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType, DeviceType>::CopyBackResults() + { + // use cpu pointers to send output + if(transpose){ + CopyDataB2HD(tlrmvmcpu->p1transptrs.y, (HostType*)cudap1transptrs.y, tlrmvmcpu->config.granksum); + 
CopyDataB2HD(tlrmvmcpu->p3transptrs.x, (HostType*)cudap3transptrs.x, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3transptrs.y, (HostType*)cudap3transptrs.y, tlrmvmcpu->config.originM); + }else{ + CopyDataB2HD(tlrmvmcpu->p1ptrs.y, (HostType*)cudap1ptrs.y, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3ptrs.x, (HostType*)cudap3ptrs.x, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3ptrs.y, (HostType*)cudap3ptrs.y, tlrmvmcpu->config.originM); + } + tlrmvmcpu->CopyToFinalresults(); + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType, DeviceType>::MVM() { + if(transpose){ + MVMTranspose(); + }else{ + MVMNoTranspose(); + } + } + + template<typename HostType, typename DeviceType> + void Tlrmvmhip<HostType, DeviceType>::SetTransposeConjugate(bool transpose, bool conjugate) { + this->transpose = transpose; + this->conjugate = conjugate; + tlrmvmcpu->SetTransposeConjugate(transpose, conjugate); + } + + + + template class Tlrmvmhip<float, float>; + template class Tlrmvmhip<double, double>; + template class Tlrmvmhip<complex<float>, hipComplex>; + template class Tlrmvmhip<complex<double>, hipDoubleComplex>; + +} // namespace hiptlrmvm + diff --git a/src/tlrmvm/hip/Tlrmvmhip.hpp b/src/tlrmvm/hip/Tlrmvmhip.hpp new file mode 100644 index 0000000000000000000000000000000000000000..631a0dfeecb7de1a1e09d9bfccfe93eccdf0ca87 --- /dev/null +++ b/src/tlrmvm/hip/Tlrmvmhip.hpp @@ -0,0 +1,114 @@ +#pragma once + +#include "../../common/Common.hpp" +#include "../cpu/TlrmvmCPU.hpp" +#include "../../common/hip/Util.hpp" +#include <cassert> +#include <hip/hip_runtime.h> +#include <hip/hip_runtime_api.h> +#include <hipblas.h> +#include <memory> + +namespace hiptlrmvm +{ + template<typename T> + struct HIPPhasePointers{ + HIPPhasePointers(); + size_t Acnt; + size_t Xcnt; + size_t Ycnt; + vector<size_t> Ms; + vector<size_t> Ks; + vector<size_t> Ns; + T *A; + T *x; + T *y; + T **Abp; + T **xbp; + T **ybp; + }; + + template<typename SrcType, typename DestType> + void PhasePointersCopyNonPointers(HIPPhasePointers<DestType> &dest, const PhasePointers<SrcType> &src); + + // Tlrmvmhip is only responsible for HIP device memory ops. + // Any host-memory-related op should go to the CPU instance. + template<typename HostType, typename DeviceType> + class Tlrmvmhip + { + public: + explicit Tlrmvmhip(TlrmvmConfig tlrmvmconfig); + Tlrmvmhip(); + void UpdateConfig(TlrmvmConfig &tlrmvmConfig); + void SetTransposeConjugate(bool transpose, bool conjugate); + void StreamInit(int streamsize); + void StreamDestroy(); + void MemoryInit(); + void MemoryFree(); + void Phase1(); + void Phase1Transpose(); + void Phase1GetMembuffer(); + void AllocatePhase1Buffer(); + void Phase1CopyData(); + void Phase1GetMembufferTranspose(); + void AllocatePhase1BufferTranspose(); + void Phase1CopyDataTranspose(); + void Phase2(); + void Phase2Transpose(); + void Phase2Prepare(); + void Phase2PrepareTranspose(); + void Phase3(); + void Phase3Transpose(); + void Phase3GetMembuffer(); + void AllocatePhase3Buffer(); + void Phase3CopyData(); + void Phase3GetMembufferTranspose(); + void AllocatePhase3BufferTranspose(); + void Phase3CopyDataTranspose(); + void MVM(); + void MVMTranspose(); + void MVMNoTranspose(); + void MVMGraph(); + void MVMGraphTranspose(); + void MVMGraphNoTranspose(); + void setX(HostType * xvector, size_t xlength); + // separate 2 functions. 
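// The two conjugate helpers act at opposite ends of the pipeline:
// TryConjugateXvec conjugates the input vector on the CPU copy before it is
// uploaded, while TryConjugateResults conjugates the device output in place
// (ConjugateDriver) and copies it back to the host buffers.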
+ void TryConjugateXvec(); + void TryConjugateResults(); + void CopyBackResults(); + + bool transpose; + bool conjugate; + // cpu instance + TlrmvmConfig config; + shared_ptr<TlrmvmCPU<HostType>> tlrmvmcpu; + // GPU resources + hipStream_t * streamptr; + hipblasHandle_t * cublashandleptr; + int stream_size; + hipGraph_t graph; + bool graphCreated; + hipGraphExec_t instance; + hipEvent_t *events; + hipEvent_t event_start; + hipEvent_t event_phase2finish; + + DeviceType alpha; + DeviceType beta; + + // gpu pointers + HIPPhasePointers<DeviceType> cudap1ptrs; + HIPPhasePointers<DeviceType> cudap1transptrs; + size_t *d_phase2mapping; + size_t *d_phase2mapping_transpose; + HIPPhasePointers<DeviceType> cudap3ptrs; + HIPPhasePointers<DeviceType> cudap3transptrs; + }; + + + + +} // + + + diff --git a/src/tlrmvm/hip/TlrmvmhipConstRank.cpp b/src/tlrmvm/hip/TlrmvmhipConstRank.cpp new file mode 100644 index 0000000000000000000000000000000000000000..379f2ff4909c3f0837f945b525557ae2cfc1e0d1 --- /dev/null +++ b/src/tlrmvm/hip/TlrmvmhipConstRank.cpp @@ -0,0 +1,484 @@ +#include <hip/hip_runtime.h> +#include <hip/hip_runtime_api.h> +#include <hipblas.h> + +#include "../../common/Common.hpp" +#include "../../common/AppUtil.hpp" +#include "../cpu/TlrmvmCPU.hpp" +#include "Tlrmvmhip.hpp" +#include "TlrmvmhipConstRank.hpp" +#include "hipkernel.cuh" + +namespace hiptlrmvm +{ + template<typename HostType, typename DeviceType> + TlrmvmhipConstRank<HostType, DeviceType>::TlrmvmhipConstRank() {} + + template<typename HostType, typename DeviceType> + TlrmvmhipConstRank<HostType, DeviceType>::TlrmvmhipConstRank(TlrmvmConfig tlrmvmconfig) + :config(tlrmvmconfig) + { + transpose = false; + conjugate = false; + init_alpha_beta(alpha, beta); + tlrmvmcpu = std::make_shared<TlrmvmCPU<HostType>>(tlrmvmconfig); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType, DeviceType>::UpdateConfig(TlrmvmConfig &tlrmvmconfig) + { +// transpose = false; +// conjugate = false; +// init_alpha_beta(alpha, beta); +// tlrmvmcpu->UpdateConfig(tlrmvmconfig); + cout << "UpdateConfig not implemented." 
<< endl; + exit(0); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::setX(HostType * xvector, size_t xlength){ + tlrmvmcpu->setX(xvector, xlength); + tlrmvmcpu->TryConjugateXvec(); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::TryConjugateXvec() { + // no transpose logic + tlrmvmcpu->TryConjugateXvec(); + CopyDataB2HD((HostType*)this->cudap1ptrs.x, tlrmvmcpu->p1ptrs.x, tlrmvmcpu->xmat.Shape()[0]); + } + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::TryConjugateResults() { + if(!conjugate) return; + if(transpose){ + ConjugateDriver<DeviceType>(cudap3transptrs.y, config.originN, stream); + CopyDataB2HD(tlrmvmcpu->p3transptrs.y, (HostType*)cudap3transptrs.y, tlrmvmcpu->config.originM); + }else{ + ConjugateDriver<DeviceType>(cudap3ptrs.y, config.originM, stream); + CopyDataB2HD(tlrmvmcpu->p3ptrs.y, (HostType*)cudap3ptrs.y, tlrmvmcpu->config.originM); + } + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::StreamInit(int streamsize){ + hipStreamCreateWithFlags(&stream, hipStreamNonBlocking); + hipblasCreate(&cublashandle); + hipblasSetStream(cublashandle, stream); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::StreamDestroy(){ + hipblasDestroy(cublashandle); + hipStreamDestroy(stream); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::MemoryInit(){ + tlrmvmcpu->MemoryInit(); + PhasePointersCopyNonPointers<HostType, DeviceType>(cudap1ptrs, tlrmvmcpu->p1ptrs); + PhasePointersCopyNonPointers<HostType, DeviceType>(cudap3ptrs, tlrmvmcpu->p3ptrs); + PhasePointersCopyNonPointers<HostType, DeviceType>(cudap1transptrs, tlrmvmcpu->p1transptrs); + PhasePointersCopyNonPointers<HostType, DeviceType>(cudap3transptrs, tlrmvmcpu->p3transptrs); + Phase1GetMembuffer(); + AllocatePhase1Buffer(); + Phase1CopyData(); + Phase2Prepare(); + Phase3GetMembuffer(); + AllocatePhase3Buffer(); + Phase3CopyData(); + // transpose + Phase1GetMembufferTranspose(); + AllocatePhase1BufferTranspose(); + Phase1CopyDataTranspose(); + Phase2PrepareTranspose(); + Phase3GetMembufferTranspose(); + AllocatePhase3BufferTranspose(); + Phase3CopyDataTranspose(); + + // init batch pointers + GetDeviceMemory(&d_p1Aptrs, cudap1ptrs.Ms.size()); + GetDeviceMemory(&d_p1xptrs, cudap1ptrs.Ms.size()); + GetDeviceMemory(&d_p1yptrs, cudap1ptrs.Ms.size()); + GetDeviceMemory(&d_p3Aptrs, cudap3ptrs.Ms.size()); + GetDeviceMemory(&d_p3xptrs, cudap3ptrs.Ms.size()); + GetDeviceMemory(&d_p3yptrs, cudap3ptrs.Ms.size()); + + CopyDataB2HD((HostType**)d_p1Aptrs, (HostType**)cudap1ptrs.Abp, cudap1ptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p1xptrs, (HostType**)cudap1ptrs.xbp, cudap1ptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p1yptrs, (HostType**)cudap1ptrs.ybp, cudap1ptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p3Aptrs, (HostType**)cudap3ptrs.Abp, cudap3ptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p3xptrs, (HostType**)cudap3ptrs.xbp, cudap3ptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p3yptrs, (HostType**)cudap3ptrs.ybp, cudap3ptrs.Ms.size()); + + GetDeviceMemory(&d_p1transAptrs, cudap1transptrs.Ms.size()); + GetDeviceMemory(&d_p1transxptrs, cudap1transptrs.Ms.size()); + GetDeviceMemory(&d_p1transyptrs, cudap1transptrs.Ms.size()); + GetDeviceMemory(&d_p3transAptrs, cudap3transptrs.Ms.size()); + GetDeviceMemory(&d_p3transxptrs, 
cudap3transptrs.Ms.size()); + GetDeviceMemory(&d_p3transyptrs, cudap3transptrs.Ms.size()); + + CopyDataB2HD((HostType**)d_p1transAptrs, (HostType**)cudap1transptrs.Abp, cudap1transptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p1transxptrs, (HostType**)cudap1transptrs.xbp, cudap1transptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p1transyptrs, (HostType**)cudap1transptrs.ybp, cudap1transptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p3transAptrs, (HostType**)cudap3transptrs.Abp, cudap3transptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p3transxptrs, (HostType**)cudap3transptrs.xbp, cudap3transptrs.Ms.size()); + CopyDataB2HD((HostType**)d_p3transyptrs, (HostType**)cudap3transptrs.ybp, cudap3transptrs.Ms.size()); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::MemoryFree(){ + tlrmvmcpu->MemoryFree(); + FreehipHostMemory(cudap1ptrs.Abp); + FreehipHostMemory(cudap1ptrs.xbp); + FreehipHostMemory(cudap1ptrs.ybp); + FreeDeviceMemory(cudap1ptrs.A); + FreeDeviceMemory(cudap1ptrs.x); + FreeDeviceMemory(cudap1ptrs.y); + + FreehipHostMemory(cudap3ptrs.Abp); + FreehipHostMemory(cudap3ptrs.xbp); + FreehipHostMemory(cudap3ptrs.ybp); + FreeDeviceMemory(cudap3ptrs.A); + FreeDeviceMemory(cudap3ptrs.x); + FreeDeviceMemory(cudap3ptrs.y); + + FreehipHostMemory(cudap1transptrs.Abp); + FreehipHostMemory(cudap1transptrs.xbp); + FreehipHostMemory(cudap1transptrs.ybp); + FreeDeviceMemory(cudap1transptrs.y); + + FreehipHostMemory(cudap3transptrs.Abp); + FreehipHostMemory(cudap3transptrs.xbp); + FreehipHostMemory(cudap3transptrs.ybp); + FreeDeviceMemory(cudap3transptrs.y); + + FreeDeviceMemory(d_p1Aptrs); + FreeDeviceMemory(d_p1xptrs); + FreeDeviceMemory(d_p1yptrs); + FreeDeviceMemory(d_p3Aptrs); + FreeDeviceMemory(d_p3xptrs); + FreeDeviceMemory(d_p3yptrs); + + FreeDeviceMemory(d_p1transAptrs); + FreeDeviceMemory(d_p1transxptrs); + FreeDeviceMemory(d_p1transyptrs); + FreeDeviceMemory(d_p3transAptrs); + FreeDeviceMemory(d_p3transxptrs); + FreeDeviceMemory(d_p3transyptrs); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase1(){ + hipDeviceSynchronize(); + for(int i=0; i<config.Ntg; i++){ + if(cudap1ptrs.Ms[i] != 0){ + hipblasgemv(cublashandle, HIPBLAS_OP_N, + cudap1ptrs.Ms[i], cudap1ptrs.Ks[i], + &alpha, (const DeviceType*)cudap1ptrs.Abp[i], cudap1ptrs.Ms[i], + (const DeviceType*)cudap1ptrs.xbp[i], 1, &beta, + cudap1ptrs.ybp[i], 1); + } + } + hipDeviceSynchronize(); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase1Transpose(){ + hipDeviceSynchronize(); + for(int i=0; i<config.Ntg; i++){ + if(cudap1transptrs.Ms[i] != 0){ + hipblasgemv(cublashandle, HIPBLAS_OP_T, + cudap1transptrs.Ks[i], cudap1transptrs.Ms[i], + &alpha, (const DeviceType*)cudap1transptrs.Abp[i], + cudap1transptrs.Ks[i], + (const DeviceType*)cudap1transptrs.xbp[i], 1, &beta, + cudap1transptrs.ybp[i], 1); + } + } + hipDeviceSynchronize(); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase1GetMembuffer(){ + int batchsize = cudap1ptrs.Ms.size(); + GethipHostMemory(&cudap1ptrs.Abp, batchsize); + GethipHostMemory(&cudap1ptrs.xbp, batchsize); + GethipHostMemory(&cudap1ptrs.ybp, batchsize); + } + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase1GetMembufferTranspose() + { + int batchsize = cudap1ptrs.Ms.size(); + GethipHostMemory(&cudap1transptrs.Abp, batchsize); + 
GethipHostMemory(&cudap1transptrs.xbp, batchsize); + GethipHostMemory(&cudap1transptrs.ybp, batchsize); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::AllocatePhase1Buffer(){ + GetDeviceMemory(&cudap1ptrs.A, cudap1ptrs.Acnt); + GetDeviceMemory(&cudap1ptrs.x, cudap1ptrs.Xcnt); + GetDeviceMemory(&cudap1ptrs.y, cudap1ptrs.Ycnt); + cudap1ptrs.Abp[0] = cudap1ptrs.A; + cudap1ptrs.xbp[0] = cudap1ptrs.x; + cudap1ptrs.ybp[0] = cudap1ptrs.y; + } + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::AllocatePhase1BufferTranspose(){ + cudap1transptrs.A = cudap3ptrs.A; + cudap1transptrs.x = cudap1ptrs.x; + GetDeviceMemory(&cudap1transptrs.y, cudap1transptrs.Ycnt); + cudap1transptrs.Abp[0] = cudap3ptrs.A; // use phase 3, U bases + cudap1transptrs.xbp[0] = cudap1ptrs.x; // use phase 1, x + cudap1transptrs.ybp[0] = cudap1transptrs.y; // create a new buffer + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase1CopyData(){ + auto AvMs = cudap1ptrs.Ms; + auto AvNs = cudap1ptrs.Ns; + auto AvKs = cudap1ptrs.Ks; + for(int i=1; i<config.Ntg; i++){ + size_t AvMK = AvMs[i-1] * AvKs[i-1]; + size_t AvKN = AvKs[i-1] * AvNs[i-1]; + size_t AvMN = AvMs[i-1] * AvNs[i-1]; + cudap1ptrs.Abp[i] = cudap1ptrs.Abp[i-1] + AvMK; + cudap1ptrs.xbp[i] = cudap1ptrs.xbp[i-1] + AvKN; + cudap1ptrs.ybp[i] = cudap1ptrs.ybp[i-1] + AvMN; + } + // load phase1 A,x to GPU + CopyDataB2HD((HostType*)cudap1ptrs.A, tlrmvmcpu->p1ptrs.A, tlrmvmcpu->p1ptrs.Acnt); + CopyDataB2HD((HostType*)cudap1ptrs.x, tlrmvmcpu->p1ptrs.x, tlrmvmcpu->p1ptrs.Xcnt); + } + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase1CopyDataTranspose(){ + for(int i=1; i<cudap1transptrs.Ms.size(); i++){ + size_t AvMK = cudap1transptrs.Ms[i-1] * cudap1transptrs.Ks[i-1]; + size_t AvKN = cudap1transptrs.Ks[i-1] * cudap1transptrs.Ns[i-1]; + size_t AvMN = cudap1transptrs.Ms[i-1] * cudap1transptrs.Ns[i-1]; + cudap1transptrs.Abp[i] = cudap1transptrs.Abp[i-1] + AvMK; + cudap1transptrs.xbp[i] = cudap1transptrs.xbp[i-1] + AvKN; + cudap1transptrs.ybp[i] = cudap1transptrs.ybp[i-1] + AvMN; + } + // no need to copy data; it was already copied in the non-transpose pass. 
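// Why the transpose path can alias the other phase's bases (a sketch of the
// algebra; notation is editorial, see the CPU implementation for the exact
// storage layout): with each tile compressed as A_ij ~= U_ij * V_ij, the
// forward product is y = sum_ij U_ij * (V_ij * x_j) -- phase 1 applies the V
// factors and phase 3 the U factors. Transposing each tile gives
//     (U_ij * V_ij)^T = V_ij^T * U_ij^T,
// so the transposed MVM runs the same three phases with the roles swapped:
// phase 1 reuses the phase-3 U buffer and phase 3 the phase-1 V buffer, each
// applied through hipblasgemv with HIPBLAS_OP_T, as the aliasing in
// AllocatePhase1BufferTranspose / AllocatePhase3BufferTranspose shows.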
+ } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase2(){ + hipDeviceSynchronize(); + phase2_nosplit<DeviceType>(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x, + config.granksum, stream); + hipDeviceSynchronize(); + } + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase2Transpose(){ + hipDeviceSynchronize(); + phase2_nosplit<DeviceType>(cudap1transptrs.y, d_phase2mapping_transpose, + cudap3transptrs.x, config.granksum, stream); + hipDeviceSynchronize(); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase2Prepare(){ + GetDeviceMemory(&d_phase2mapping, tlrmvmcpu->h_phase2mapping.size()); + CopyDataB2HD(d_phase2mapping, tlrmvmcpu->h_phase2mapping.data(),tlrmvmcpu->h_phase2mapping.size()); + } + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase2PrepareTranspose(){ + GetDeviceMemory(&d_phase2mapping_transpose, tlrmvmcpu->h_phase2mappingTranspose.size()); + CopyDataB2HD(d_phase2mapping_transpose, tlrmvmcpu->h_phase2mappingTranspose.data(), + tlrmvmcpu->h_phase2mappingTranspose.size()); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase3(){ + hipDeviceSynchronize(); + for(int i=0; i<config.Mtg; i++){ + hipblasgemv(cublashandle, HIPBLAS_OP_N, + cudap3ptrs.Ms[i], cudap3ptrs.Ks[i], + &alpha, (const DeviceType*)cudap3ptrs.Abp[i], cudap3ptrs.Ms[i], + (const DeviceType*)cudap3ptrs.xbp[i], 1, &beta,cudap3ptrs.ybp[i], 1); + } + hipDeviceSynchronize(); + } + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase3Transpose(){ + hipDeviceSynchronize(); + for(int i=0; i<config.Mtg; i++){ + hipblasgemv(cublashandle, HIPBLAS_OP_T, + cudap3transptrs.Ks[i], cudap3transptrs.Ms[i], + &alpha, (const DeviceType*)cudap3transptrs.Abp[i], + cudap3transptrs.Ks[i], + (const DeviceType*)cudap3transptrs.xbp[i], 1, &beta,cudap3transptrs.ybp[i], 1); + } + hipDeviceSynchronize(); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase3GetMembuffer(){ + int batchsize = cudap3ptrs.Ms.size(); + GethipHostMemory(&cudap3ptrs.Abp, batchsize); + GethipHostMemory(&cudap3ptrs.xbp, batchsize); + GethipHostMemory(&cudap3ptrs.ybp, batchsize); + } + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase3GetMembufferTranspose(){ + int batchsize = cudap3transptrs.Ms.size(); + GethipHostMemory(&cudap3transptrs.Abp, batchsize); + GethipHostMemory(&cudap3transptrs.xbp, batchsize); + GethipHostMemory(&cudap3transptrs.ybp, batchsize); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::AllocatePhase3Buffer(){ + GetDeviceMemory(&cudap3ptrs.A, cudap3ptrs.Acnt); + GetDeviceMemory(&cudap3ptrs.x, cudap3ptrs.Xcnt); + GetDeviceMemory(&cudap3ptrs.y, cudap3ptrs.Ycnt); + cudap3ptrs.Abp[0] = cudap3ptrs.A; + cudap3ptrs.xbp[0] = cudap3ptrs.x; + cudap3ptrs.ybp[0] = cudap3ptrs.y; + } + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::AllocatePhase3BufferTranspose(){ + cudap3transptrs.A = cudap1ptrs.A; + cudap3transptrs.x = cudap3ptrs.x; + GetDeviceMemory(&cudap3transptrs.y, cudap3transptrs.Ycnt); + cudap3transptrs.Abp[0] = cudap1ptrs.A; // use phase 1, V bases + cudap3transptrs.xbp[0] = cudap3ptrs.x; // use phase 3, x + 
cudap3transptrs.ybp[0] = cudap3transptrs.y; // create a new buffer + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase3CopyData(){ + auto AuMs = cudap3ptrs.Ms; + auto AuNs = cudap3ptrs.Ns; + auto AuKs = cudap3ptrs.Ks; + for(int i=1; i<tlrmvmcpu->config.Mtg; i++){ + size_t AuMK = AuMs[i-1] * AuKs[i-1]; + size_t AuKN = AuKs[i-1] * AuNs[i-1]; + size_t AuMN = AuMs[i-1] * AuNs[i-1]; + cudap3ptrs.Abp[i] = cudap3ptrs.Abp[i-1] + AuMK; + cudap3ptrs.xbp[i] = cudap3ptrs.xbp[i-1] + AuKN; + cudap3ptrs.ybp[i] = cudap3ptrs.ybp[i-1] + AuMN; + } + // load phase 3 A to GPU + CopyDataB2HD((HostType*)cudap3ptrs.A, tlrmvmcpu->p3ptrs.A, cudap3ptrs.Acnt); + } + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::Phase3CopyDataTranspose(){ + for(int i=1; i<cudap3transptrs.Ms.size(); i++){ + size_t AvMK = cudap3transptrs.Ms[i-1] * cudap3transptrs.Ks[i-1]; + size_t AvKN = cudap3transptrs.Ks[i-1] * cudap3transptrs.Ns[i-1]; + size_t AvMN = cudap3transptrs.Ms[i-1] * cudap3transptrs.Ns[i-1]; + cudap3transptrs.Abp[i] = cudap3transptrs.Abp[i-1] + AvMK; + cudap3transptrs.xbp[i] = cudap3transptrs.xbp[i-1] + AvKN; + cudap3transptrs.ybp[i] = cudap3transptrs.ybp[i-1] + AvMN; + } + // no need to copy data. + } + + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::MVMTranspose() + { + hipDeviceSynchronize(); + for(int i=0; i<config.Ntg; i++){ + if(cudap1transptrs.Ms[i] != 0){ + hipblasgemv(cublashandle, HIPBLAS_OP_T, + cudap1transptrs.Ks[i], cudap1transptrs.Ms[i], + &alpha, (const DeviceType*)cudap1transptrs.Abp[i], + cudap1transptrs.Ks[i], + (const DeviceType*)cudap1transptrs.xbp[i], 1, &beta, + cudap1transptrs.ybp[i], 1); + } + } + hipDeviceSynchronize(); + phase2_nosplit<DeviceType>(cudap1transptrs.y, d_phase2mapping_transpose, + cudap3transptrs.x, config.granksum, stream); + hipDeviceSynchronize(); + for(int i=0; i<config.Mtg; i++){ + hipblasgemv(cublashandle, HIPBLAS_OP_T, + cudap3transptrs.Ks[i], cudap3transptrs.Ms[i], + &alpha, (const DeviceType*)cudap3transptrs.Abp[i], + cudap3transptrs.Ks[i], + (const DeviceType*)cudap3transptrs.xbp[i], 1, &beta,cudap3transptrs.ybp[i], 1); + } + hipDeviceSynchronize(); + } + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType,DeviceType>::MVMNoTranspose() + { + hipDeviceSynchronize(); + hipblasgemmbatched(cublashandle, HIPBLAS_OP_N, HIPBLAS_OP_N, + cudap1ptrs.Ms[0],cudap1ptrs.Ns[0],cudap1ptrs.Ks[0], + &alpha, (const DeviceType**)d_p1Aptrs, cudap1ptrs.Ms[0], + (const DeviceType**)d_p1xptrs, cudap1ptrs.Ks[0], + &beta,d_p1yptrs, cudap1ptrs.Ms[0], cudap1ptrs.Ms.size()); +// for(int i=0; i<config.Ntg; i++){ +// if(cudap1ptrs.Ms[i] != 0){ +// hipblasgemv(cublashandle, HIPBLAS_OP_N, +// cudap1ptrs.Ms[i], cudap1ptrs.Ks[i], +// &alpha, (const DeviceType*)cudap1ptrs.Abp[i], cudap1ptrs.Ms[i], +// (const DeviceType*)cudap1ptrs.xbp[i], 1, &beta, +// cudap1ptrs.ybp[i], 1); +// } +// } + hipDeviceSynchronize(); + phase2_nosplit<DeviceType>(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x, + config.granksum, stream); + hipDeviceSynchronize(); + hipblasgemmbatched(cublashandle, HIPBLAS_OP_N, HIPBLAS_OP_N, + cudap3ptrs.Ms[0],cudap3ptrs.Ns[0],cudap3ptrs.Ks[0], + &alpha, (const DeviceType**)d_p3Aptrs, cudap3ptrs.Ms[0], + (const DeviceType**)d_p3xptrs, cudap3ptrs.Ks[0], + &beta,d_p3yptrs, cudap3ptrs.Ms[0], cudap3ptrs.Ms.size()); +// for(int i=0; i<config.Mtg; i++){ +// hipblasgemv(cublashandle, HIPBLAS_OP_N, +// cudap3ptrs.Ms[i], 
cudap3ptrs.Ks[i], +// &alpha, (const DeviceType*)cudap3ptrs.Abp[i], cudap3ptrs.Ms[i], +// (const DeviceType*)cudap3ptrs.xbp[i], 1, &beta,cudap3ptrs.ybp[i], 1); +// } + hipDeviceSynchronize(); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType, DeviceType>::CopyBackResults() + { + // use cpu pointers to send output + if(transpose){ + CopyDataB2HD(tlrmvmcpu->p1transptrs.y, (HostType*)cudap1transptrs.y, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3transptrs.x, (HostType*)cudap3transptrs.x, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3transptrs.y, (HostType*)cudap3transptrs.y, tlrmvmcpu->config.originM); + }else{ + CopyDataB2HD(tlrmvmcpu->p1ptrs.y, (HostType*)cudap1ptrs.y, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3ptrs.x, (HostType*)cudap3ptrs.x, tlrmvmcpu->config.granksum); + CopyDataB2HD(tlrmvmcpu->p3ptrs.y, (HostType*)cudap3ptrs.y, tlrmvmcpu->config.originM); + } + tlrmvmcpu->CopyToFinalresults(); + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType, DeviceType>::MVM() { + if(transpose){ + MVMTranspose(); + }else{ + MVMNoTranspose(); + } + } + + template<typename HostType, typename DeviceType> + void TlrmvmhipConstRank<HostType, DeviceType>::SetTransposeConjugate(bool transpose, bool conjugate) { + this->transpose = transpose; + this->conjugate = conjugate; + tlrmvmcpu->SetTransposeConjugate(transpose, conjugate); + } + + + + template class TlrmvmhipConstRank<float, float>; + template class TlrmvmhipConstRank<double, double>; + template class TlrmvmhipConstRank<complex<float>, hipComplex>; + template class TlrmvmhipConstRank<complex<double>, hipDoubleComplex>; + +} // namespace hiptlrmvm + diff --git a/src/tlrmvm/hip/TlrmvmhipConstRank.hpp b/src/tlrmvm/hip/TlrmvmhipConstRank.hpp new file mode 100644 index 0000000000000000000000000000000000000000..f58b8da5e7d1a083d0c3641630c7f46365056556 --- /dev/null +++ b/src/tlrmvm/hip/TlrmvmhipConstRank.hpp @@ -0,0 +1,100 @@ +#pragma once + +#include "../../common/Common.hpp" +#include "../cpu/TlrmvmCPU.hpp" +#include "../../common/hip/Util.hpp" +#include "Tlrmvmhip.hpp" +#include <cassert> +#include <hip/hip_runtime.h> +#include <hip/hip_runtime_api.h> +#include <hipblas.h> +#include <memory> + +namespace hiptlrmvm +{ + + template<typename HostType, typename DeviceType> + class TlrmvmhipConstRank + { + public: + explicit TlrmvmhipConstRank(TlrmvmConfig tlrmvmconfig); + TlrmvmhipConstRank(); + void UpdateConfig(TlrmvmConfig &tlrmvmConfig); + void SetTransposeConjugate(bool transpose, bool conjugate); + void StreamInit(int streamsize); + void StreamDestroy(); + void MemoryInit(); + void MemoryFree(); + void Phase1(); + void Phase1Transpose(); + void Phase1GetMembuffer(); + void AllocatePhase1Buffer(); + void Phase1CopyData(); + void Phase1GetMembufferTranspose(); + void AllocatePhase1BufferTranspose(); + void Phase1CopyDataTranspose(); + void Phase2(); + void Phase2Transpose(); + void Phase2Prepare(); + void Phase2PrepareTranspose(); + void Phase3(); + void Phase3Transpose(); + void Phase3GetMembuffer(); + void AllocatePhase3Buffer(); + void Phase3CopyData(); + void Phase3GetMembufferTranspose(); + void AllocatePhase3BufferTranspose(); + void Phase3CopyDataTranspose(); + void MVM(); + void MVMTranspose(); + void MVMNoTranspose(); + void MVMGraph(); + void MVMGraphTranspose(); + void MVMGraphNoTranspose(); + void setX(HostType * xvector, size_t xlength); + // separate 2 functions. 
+ void TryConjugateXvec(); + void TryConjugateResults(); + void CopyBackResults(); + + bool transpose; + bool conjugate; + // cpu instance + TlrmvmConfig config; + shared_ptr<TlrmvmCPU<HostType>> tlrmvmcpu; + // GPU resources + hipStream_t stream; + hipblasHandle_t cublashandle; + DeviceType alpha; + DeviceType beta; + + // gpu pointers + HIPPhasePointers<DeviceType> cudap1ptrs; + HIPPhasePointers<DeviceType> cudap1transptrs; + size_t *d_phase2mapping; + size_t *d_phase2mapping_transpose; + HIPPhasePointers<DeviceType> cudap3ptrs; + HIPPhasePointers<DeviceType> cudap3transptrs; + + DeviceType **d_p1Aptrs; + DeviceType **d_p1xptrs; + DeviceType **d_p1yptrs; + DeviceType **d_p3Aptrs; + DeviceType **d_p3xptrs; + DeviceType **d_p3yptrs; + + DeviceType **d_p1transAptrs; + DeviceType **d_p1transxptrs; + DeviceType **d_p1transyptrs; + DeviceType **d_p3transAptrs; + DeviceType **d_p3transxptrs; + DeviceType **d_p3transyptrs; + }; + + + + +} // + + + diff --git a/src/tlrmvm/hip/hipkernel.cpp b/src/tlrmvm/hip/hipkernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..bdd78bb10f0fe6603ffb5736974a74ca9d92c7bc --- /dev/null +++ b/src/tlrmvm/hip/hipkernel.cpp @@ -0,0 +1,74 @@ +// +// Created by Yuxi Hong on 28/02/2022. +// + +#include "../../common/hip/Util.hpp" +#include "hipkernel.cuh" + + +namespace hiptlrmvm { + + template<typename T> + __global__ void phase2_nosplit_kernel(const T * __restrict__ yv, + const size_t * __restrict__ phase2mapping, T * __restrict__ yu, size_t len) + { + size_t thread_x = blockDim.x * blockIdx.x + threadIdx.x; + if (thread_x < len){ + yu[phase2mapping[thread_x]] = yv[thread_x]; + } + } + + template<typename T> + void phase2_nosplit(const T *yv, const size_t * phase2mapping, T * yu, size_t len, hipStream_t stream){ + int dimx = 512; + int griddimx = (len+dimx-1) / dimx; + phase2_nosplit_kernel<<<griddimx, dimx, 0, stream>>>(yv, phase2mapping, yu, len); + HIPCHECK(hipGetLastError()); + } + + template void phase2_nosplit<float>(const float*, const size_t *, float *, size_t, hipStream_t); + template void phase2_nosplit<double>(const double*, const size_t *, double *, size_t, hipStream_t); + template void phase2_nosplit<hipDoubleComplex>(const hipDoubleComplex*, const size_t *, + hipDoubleComplex *, size_t, hipStream_t); + template void phase2_nosplit<hipComplex>(const hipComplex*, const size_t *, hipComplex *, size_t, hipStream_t); +// template void phase2_nosplit<cuHalfComplex>(const cuHalfComplex*, const size_t *, cuHalfComplex *, size_t, hipStream_t); + + + __forceinline__ __device__ float conj(float Invec){ + return Invec; + } + __forceinline__ __device__ double conj(double Invec){ + return Invec; + } + __forceinline__ __device__ hipComplex conj(hipComplex Invec){ + return {Invec.x, -Invec.y}; + } + __forceinline__ __device__ hipDoubleComplex conj(hipDoubleComplex Invec){ + return {Invec.x, -Invec.y}; + } + + template<typename T> + __global__ void ConjugateKernel(T *Invec, size_t length) + { + size_t thread_x = blockDim.x * blockIdx.x + threadIdx.x; + if (thread_x < length){ + Invec[thread_x] = conj(Invec[thread_x]); + } + } + + template<typename T> + void ConjugateDriver(T *Invec, size_t length, hipStream_t stream){ + int dimx = 512; + int griddimx = (length+dimx-1) / dimx; + ConjugateKernel<<<griddimx, dimx, 0, stream>>>(Invec, length); + HIPCHECK(hipGetLastError()); + } + template void ConjugateDriver<float>(float *Invec, size_t length, hipStream_t stream); + template void ConjugateDriver<double>(double *Invec, size_t length, hipStream_t 
stream); + + template void ConjugateDriver<hipComplex>(hipComplex *Invec, size_t length, hipStream_t stream); + template void ConjugateDriver<hipDoubleComplex>(hipDoubleComplex *Invec, size_t length, hipStream_t stream); + + +} // namespace + diff --git a/src/tlrmvm/hip/hipkernel.cuh b/src/tlrmvm/hip/hipkernel.cuh new file mode 100644 index 0000000000000000000000000000000000000000..0549467f9edc22b4b935fcdc26688bde18ca75c5 --- /dev/null +++ b/src/tlrmvm/hip/hipkernel.cuh @@ -0,0 +1,22 @@ +#pragma once + +#include <iostream> +#include <complex> +#include <hip/hip_runtime.h> +#include <hip/hip_fp16.h> +#include <hip/hip_runtime.h> + + +namespace hiptlrmvm{ + + // normal phase 2 + template<typename T> + void phase2_nosplit(const T *yv, const size_t * phase2mapping, T * yu, size_t len, hipStream_t stream); + + // in-place conjugate convert + template<typename T> + void ConjugateDriver(T *Invec, size_t length, hipStream_t stream); + + + +} // namespace diff --git a/src/tlrmvm/hip/tlrmvmhiputil.cpp b/src/tlrmvm/hip/tlrmvmhiputil.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d4c9ca0c049ac5d13b1018b29fb97a31d8380df4 --- /dev/null +++ b/src/tlrmvm/hip/tlrmvmhiputil.cpp @@ -0,0 +1,91 @@ +// +// Created by Yuxi Hong on 08/04/2022. +// + +#include "tlrmvmhiputil.hpp" +#include <hip/hip_runtime.h> +#include <hip/hip_runtime_api.h> +#include <hipblas.h> + +namespace hiptlrmvm{ + + SingleGraph::SingleGraph() {} + + void SingleGraph::StreamInit(int streamsize) { + // single graph creation + this->streamsize = streamsize; + HIPCHECK(hipEventCreate(&event_start)); + HIPCHECK(hipEventCreate(&event_phase2finish)); + HIPCHECK(hipEventCreate(&event_phase1finish)); + events = new hipEvent_t[4*streamsize]; + for(int i=0; i<4*streamsize; i++) HIPCHECK(hipEventCreate(&events[i])); + graphCreated = false; + } + + void SingleGraph::syncallstreams(hipEvent_t *eventsptr, hipStream_t *streamptr, int stream_size) { + for(int streami=1; streami < stream_size; streami++){ + hipEventRecord(eventsptr[streami], streamptr[streami]); + } + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[0], eventsptr[streami],0); + } + hipEventRecord(eventsptr[0], streamptr[0]); + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[streami], eventsptr[0],0); + } + } + + void SingleGraph::syncstream0(hipEvent_t *eventsptr, hipStream_t *streamptr, int stream_size) { + for(int streami=1; streami < stream_size; streami++){ + hipEventRecord(eventsptr[streami], streamptr[streami]); + } + for(int streami=1; streami < stream_size; streami++){ + hipStreamWaitEvent(streamptr[0], eventsptr[streami],0); + } + } + + void SingleGraph::syncotherstreams(hipEvent_t event, hipStream_t * streamptr, int stream_size){ + hipEventRecord(event, streamptr[0]); + for(int streami=1; streami<stream_size; streami++){ + hipStreamWaitEvent(streamptr[streami], event,0); + } + } + + MultiGraph::MultiGraph() {} + + void MultiGraph::StreamInit(int batchsize, int streamsize) { + this->batchsize = batchsize; + this->streamsize = streamsize; + // multi graph creation + event_start.resize(batchsize); + event_phase2finish.resize(batchsize); + graphCreated.resize(batchsize); + instance.resize(batchsize); + graph.resize(batchsize); + events = new hipEvent_t*[batchsize]; + for(int bi=0; bi<batchsize; bi++){ + HIPCHECK(hipEventCreate(&event_start[bi])); + HIPCHECK(hipEventCreate(&event_phase2finish[bi])); + events[bi] = new hipEvent_t[4*streamsize]; + for(int i=0; i<4*streamsize; i++) 
HIPCHECK(hipEventCreate(&events[bi][i])); + graphCreated[bi] = false; + } + } + + CUDAI8basesPointers::CUDAI8basesPointers() {} + CUDAI8XPointers::CUDAI8XPointers(){} + + CBMaxInfo::CBMaxInfo() {maxA=maxx=maxy=maxbatchsize=0;} + + void getcomplexvectormax(complex<float> *hy, size_t xlength){ + double rmax = 0; + double imax = 0; + for(size_t i=0; i<xlength; i++){ + rmax = fmax(rmax, abs(hy[i].real())); + imax = fmax(imax, abs(hy[i].imag())); + } + cout << "rmax " << rmax << ", imax " << imax << endl; + } + +} + diff --git a/src/tlrmvm/hip/tlrmvmhiputil.hpp b/src/tlrmvm/hip/tlrmvmhiputil.hpp new file mode 100644 index 0000000000000000000000000000000000000000..d3bb82713f48c1595641b30d0fb30e773a0e9dc8 --- /dev/null +++ b/src/tlrmvm/hip/tlrmvmhiputil.hpp @@ -0,0 +1,95 @@ +#pragma once + +#include <hip/hip_runtime.h> // for hipGraph_t, hipGraphExec_t, hipEvent_t, hipStream_t below +#include <hip/hip_complex.h> // for hipComplex below + +#include <vector> +using std::vector; + +#include "../cpu/TlrmvmCPU.hpp" + +#ifdef USE_MPI +#include <mpi.h> +#endif + +namespace hiptlrmvm { + + struct SingleGraph{ + SingleGraph(); + void StreamInit(int streamsize); + int streamsize; + hipGraph_t graph; + bool graphCreated; + hipGraphExec_t instance; + hipEvent_t* events; + hipEvent_t event_start; + hipEvent_t event_phase1finish; + hipEvent_t event_phase2finish; + void syncallstreams(hipEvent_t * events, hipStream_t * stream,int streamsize); + void syncstream0(hipEvent_t * events, hipStream_t * stream,int streamsize); + void syncotherstreams(hipEvent_t event, hipStream_t * stream,int streamsize); + }; + + struct MultiGraph{ + MultiGraph(); + void StreamInit(int batchsize, int streamsize); + int batchsize; + int streamsize; + vector<hipGraph_t> graph; + vector<bool> graphCreated; + vector<hipGraphExec_t> instance; + hipEvent_t **events; + vector<hipEvent_t> event_start; + vector<hipEvent_t> event_phase2finish; + }; + + // BatchTlrmvmcudaINT8 + + struct CUDAI8basesPointers{ + CUDAI8basesPointers(); + size_t Acnt; + size_t Xcnt; + size_t Ycnt; + vector<size_t> Ms; + vector<size_t> Ks; + vector<size_t> Ns; + + hipInt8Complex * Abuffer; // real data buffer + vector<hipComplex> maxA; + hipComplex * maxA_device; // used to scale up to fp16 + vector<size_t> Aelems; // each gemv A elems + vector<size_t> Aelemsoffset; // each gemv A elems, prefix + size_t * Aelemsoffset_device; // used to scale up to fp16 + hipHalfComplex * ybuffer; // y buffer, always a half buffer + + vector<size_t> xelems; // each gemv x elems + vector<size_t> xelemsoffset; // each gemv x elems, prefix + + vector<size_t> yelems; // each gemv y elems + vector<size_t> yelemsoffset; // each gemv y elems, prefix + + }; + + struct CUDAI8XPointers{ + CUDAI8XPointers(); + hipInt8Complex * xbuffer; + vector<hipComplex> maxx; + hipComplex * maxx_device; // used to scale up to fp16 + vector<size_t> xelems; // each gemv x elems + size_t* xelems_device; // each gemv x elems + vector<size_t> xelemsoffset; // each gemv x elems, prefix + size_t* xelemsoffset_device; // each gemv x elems, prefix + hipComplex *p3xreductionbuffer_device; + }; + + struct CBMaxInfo{ + CBMaxInfo(); + size_t maxA; + size_t maxx; + size_t maxy; + size_t maxbatchsize; + }; + + void getcomplexvectormax(complex<float> *hy, size_t xlength); + +} + diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 476d37e6804c5179fc87ba745cd3401a23cbcda4..5893bff94bee4d6360f8f9cb17d1bdcba882012b 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -44,3 +44,20 @@ if(BUILD_CUDA) install(TARGETS ${cbins} DESTINATION test) endforeach() endif() + +if(BUILD_HIP) + + set(BINS + Test_hip_hipblas + Test_hip_tlrmvm + Test_hip_constrank +# 
Test_hip_tlrmvmgraph + ) + foreach(cbins ${BINS}) + WrapHIPBinary(${cbins} hip) + install(TARGETS ${cbins} DESTINATION test) + endforeach() +# install(FILES +# ${CMAKE_CURRENT_LIST_DIR}/hip/Test_hip_tlrmvm_correctness.sh +# DESTINATION test PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE) +endif() \ No newline at end of file diff --git a/test/cpp/ex2mpitlrmvm_complexfloat.cpp b/test/cpp/ex2mpitlrmvm_complexfloat.cpp index edb150dbfec609640799f45fa9bbbc583b42c6a6..902b25fb0acf36084f5762af96f55d0d7fbb2da9 100644 --- a/test/cpp/ex2mpitlrmvm_complexfloat.cpp +++ b/test/cpp/ex2mpitlrmvm_complexfloat.cpp @@ -1,17 +1,17 @@ #include <string> #include <vector> #include <chrono> -#include <memory.h> #include <algorithm> #include <mpi.h> - #include <common/Common.hpp> #include <tlrmvm/Tlrmvm.hpp> +#define real complex<float> using namespace std; int main (int argc, char ** argv){ int originM; int originN; int nb; + int loopsize; string acc; string datafolder; string problemname; @@ -22,7 +22,6 @@ int main (int argc, char ** argv){ vector<double> bandstat; double bytesprocessed; size_t granksum; - int loopsize; auto argparser = ArgsParser(argc, argv); originM = argparser.getint("M"); originN = argparser.getint("N"); @@ -44,18 +43,17 @@ int main (int argc, char ** argv){ maskmat.Fill(0); for(int i=0; i<tlrmvmconfig.Mtg; i++){ for(int j=0; j<tlrmvmconfig.Ntg; j++){ - if (j % size == rank ) + if (j % size == rank) maskmat.SetElem(i,j,1); } } tlrmvmconfig.UpdateMaskmat(maskmat); - TlrmvmCPU<complex<float>> tlrmvmptr(tlrmvmconfig); - auto finalbuffer = new complex<float>[tlrmvmptr.config.paddingM]; -// tlrmvmptr.xmat.Fill(0.001); - memset(finalbuffer, 0, sizeof(complex<float>) * tlrmvmptr.config.paddingM); + TlrmvmCPU<real> tlrmvmptr(tlrmvmconfig); + auto finalbuffer = new real[tlrmvmptr.config.paddingM]; + memset(finalbuffer, 0, sizeof(real) * tlrmvmptr.config.paddingM); tlrmvmptr.MemoryInit(); - auto curx = Matrix<complex<float>>(tlrmvmptr.config.originN, 1); - curx.Fill(complex<float>(0.1,1.0)); + auto curx = Matrix<real>(tlrmvmptr.config.originN, 1); + curx.Fill(real(0.1,1.0)); tlrmvmptr.setX(curx.RawPtr(), curx.Shape()[0]); for(int i=0; i<loopsize; i++){ MPI_Barrier(MPI_COMM_WORLD); @@ -75,32 +73,30 @@ int main (int argc, char ** argv){ timestat.size(), MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); if(rank == 0){ #ifndef USE_NEC - CFPPCMatrix seismicpcmat(datafolder, acc, nb, problemname, originM, originN); - seismicpcmat.setX(tlrmvmptr.xmat); - seismicpcmat.GetDense(); - Matrix<complex<float>> yv_pc = seismicpcmat.Phase1(); - auto hyv = Matrix<complex<float>>(tlrmvmptr.p1ptrs.y, tlrmvmptr.config.workmatgranksum, 1); + CFPPCMatrix pcmat(datafolder, acc, nb, problemname, originM, originN); + pcmat.setX(tlrmvmptr.xmat); + pcmat.GetDense(); + Matrix<real> yv_pc = pcmat.Phase1(); + auto hyv = Matrix<real>(tlrmvmptr.p1ptrs.y, tlrmvmptr.config.workmatgranksum, 1); // cout << " Phase 1 Correctness : " << hyv.allclose(yv_pc) << endl; - Matrix<complex<float>> yu_pc = seismicpcmat.Phase2(); - auto hyu = Matrix<complex<float>>(tlrmvmptr.p3ptrs.x, tlrmvmptr.config.workmatgranksum, 1); + Matrix<real> yu_pc = pcmat.Phase2(); + auto hyu = Matrix<real>(tlrmvmptr.p3ptrs.x, tlrmvmptr.config.workmatgranksum, 1); // cout << " Phase 2 Correctness : " << hyu.allclose(yu_pc) << endl; - Matrix<complex<float>> y_pc = seismicpcmat.Phase3(); - auto hy = Matrix<complex<float>>(finalbuffer, tlrmvmptr.config.originM, 1); - auto denseout = seismicpcmat.GetDense() * tlrmvmptr.xmat; + Matrix<real> y_pc = pcmat.Phase3(); + auto 
diff --git a/test/cpp/ex2mpitlrmvm_float.cpp b/test/cpp/ex2mpitlrmvm_float.cpp
index 5685e0af5846ca72887cc91d31e99ef2a86d46dd..aad825c8e4b32e2f421550af7f773659f07dd784 100644
--- a/test/cpp/ex2mpitlrmvm_float.cpp
+++ b/test/cpp/ex2mpitlrmvm_float.cpp
@@ -1,11 +1,11 @@
 #include <string>
 #include <vector>
 #include <chrono>
-
 #include <algorithm>
 #include <mpi.h>
 #include <common/Common.hpp>
 #include <tlrmvm/Tlrmvm.hpp>
+#define real float
 using namespace std;
 int main (int argc, char ** argv){
     int originM;
@@ -27,10 +27,10 @@ int main (int argc, char ** argv){
     originN = argparser.getint("N");
     nb = argparser.getint("nb");
     loopsize = argparser.getint("loopsize");
-    acc = argparser.getstring("errorthreshold");
-    problemname = argparser.getstring("problemname");
+    acc = argparser.getstring("threshold");
+    problemname = argparser.getstring("problem");
     datafolder = argparser.getstring("datafolder");
-    char rpath[100];
+    char rpath[300];
     sprintf(rpath, "%s/%s_Rmat_nb%d_acc%s.bin", datafolder.c_str(), problemname.c_str(), nb, acc.c_str());
     rankfile = string(rpath);
     TlrmvmConfig tlrmvmconfig(originM, originN, nb, datafolder, acc, problemname);
@@ -47,21 +47,22 @@ int main (int argc, char ** argv){
                 maskmat.SetElem(i,j,1);
         }
     }
-    tlrmvmconfig.Maskmat = maskmat;
-    TlrmvmCPU<float> tlrmvmptr(tlrmvmconfig);
-    double bytes = TLRMVMBytesProcessed<float>(tlrmvmptr.config.granksum,
-            tlrmvmptr.config.nb, tlrmvmptr.config.paddingM,
-            tlrmvmptr.config.paddingN);
+    tlrmvmconfig.UpdateMaskmat(maskmat);
+    TlrmvmCPU<real> tlrmvmptr(tlrmvmconfig);
+    auto finalbuffer = new real[tlrmvmptr.config.paddingM];
+    memset(finalbuffer, 0, sizeof(real) * tlrmvmptr.config.paddingM);
     tlrmvmptr.MemoryInit();
-    auto finalbuffer = new float[tlrmvmptr.config.originM];
-    for(int i=0; i<tlrmvmptr.config.originM; i++) finalbuffer[i] = 0.0;
+    auto curx = Matrix<real>(tlrmvmptr.config.originN, 1);
+    curx.Fill(0.1);
+    tlrmvmptr.setX(curx.RawPtr(), curx.Shape()[0]);
     for(int i=0; i<loopsize; i++){
         MPI_Barrier(MPI_COMM_WORLD);
         auto start = std::chrono::steady_clock::now();
         tlrmvmptr.MVM();
+        tlrmvmptr.CopyToFinalresults();
         MPI_Barrier(MPI_COMM_WORLD);
         MPI_Reduce(tlrmvmptr.finalresults,
-            finalbuffer, tlrmvmptr.config.originM,
+            finalbuffer, tlrmvmptr.config.paddingM,
             MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
         auto end = std::chrono::steady_clock::now();
         auto elapsed_time = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
@@ -71,22 +72,25 @@ int main (int argc, char ** argv){
     MPI_Allreduce(timestat.data(), mergetime.data(),
         timestat.size(), MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
     if(rank == 0){
-        FPPCMatrix seismicpcmat(datafolder, acc, nb, problemname, originM, originN);
-        seismicpcmat.setX(tlrmvmptr.xmat);
-        seismicpcmat.GetDense();
-        Matrix<float> yv_pc = seismicpcmat.Phase1();
-        auto hyv = Matrix<float>(tlrmvmptr.p1ptrs.y, tlrmvmptr.config.workmatgranksum, 1);
-        // cout << " Phase 1 Correctness : " << hyv.allclose(yv_pc) << endl;
-        Matrix<float> yu_pc = seismicpcmat.Phase2();
-        auto hyu = Matrix<float>(tlrmvmptr.p3ptrs.x, tlrmvmptr.config.workmatgranksum, 1);
-        // cout << " Phase 2 Correctness : " << hyu.allclose(yu_pc) << endl;
-        Matrix<float> y_pc = seismicpcmat.Phase3();
-        auto hy = Matrix<float>(tlrmvmptr.p3ptrs.y, tlrmvmptr.config.originM, 1);
-        cout << " Check MPI Phase 3 Correctness : "<< hy.allclose(y_pc) << endl;
+#ifndef USE_NEC
+        FPPCMatrix pcmat(datafolder, acc, nb, problemname, originM, originN);
+        pcmat.setX(tlrmvmptr.xmat);
+        pcmat.GetDense();
+        Matrix<real> yv_pc = pcmat.Phase1();
+        auto hyv = Matrix<real>(tlrmvmptr.p1ptrs.y, tlrmvmptr.config.workmatgranksum, 1);
+//        cout << " Phase 1 Correctness : " << hyv.allclose(yv_pc) << endl;
+        Matrix<real> yu_pc = pcmat.Phase2();
+        auto hyu = Matrix<real>(tlrmvmptr.p3ptrs.x, tlrmvmptr.config.workmatgranksum, 1);
+//        cout << " Phase 2 Correctness : " << hyu.allclose(yu_pc) << endl;
+        Matrix<real> y_pc = pcmat.Phase3();
+        auto hy = Matrix<real>(finalbuffer, tlrmvmptr.config.originM, 1);
+        auto denseout = pcmat.GetDense() * tlrmvmptr.xmat;
+        cout << " Check MPI Phase 3 Correctness : "<< hy.allclose(denseout) << endl;
+#endif
         std::sort(mergetime.begin(), mergetime.end());
         int N = mergetime.size();
         cout << "median " << mergetime[N / 2] * 1e6 << " us."<< endl;
-        double bytes = TLRMVMBytesProcessed<float>(tlrmvmptr.config.granksum,
+        double bytes = TLRMVMBytesProcessed<real>(tlrmvmptr.config.granksum,
                 tlrmvmptr.config.nb, originM, originN);
         cout << "U and V bases size: " << bytes * 1e-6 << " MB." << endl;
         cout << "Bandwidth " << bytes / mergetime[N/2] * 1e-9 << " GB/s" << endl;
@@ -96,5 +100,3 @@ int main (int argc, char ** argv){
     MPI_Finalize();
     return 0;
 }
-
-
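At the end of each run, per-iteration timings are reduced across ranks with MPI_MAX (so the slowest rank defines an iteration), sorted, and the median divides the bytes moved to give a bandwidth figure. A self-contained sketch of that bookkeeping; the byte count is a simplified traffic model standing in for TLRMVMBytesProcessed, whose exact formula this diff does not show:

    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Median-based reporting in the style of the MPI drivers above.
    void report(std::vector<double> seconds, double bytes) {
        std::sort(seconds.begin(), seconds.end());
        const double median = seconds[seconds.size() / 2];
        std::cout << "median " << median * 1e6 << " us." << std::endl;
        std::cout << "Bandwidth " << bytes / median * 1e-9 << " GB/s" << std::endl;
    }

    int main() {
        // Assumed model: U and V bases of total rank granksum with tile size nb,
        // plus one read of x and one write of y, all in single precision.
        const std::size_t granksum = 10000, nb = 256, M = 9801, N = 9801;
        const double bytes = sizeof(float) * (2.0 * granksum * nb + M + N);
        report({1.1e-3, 0.9e-3, 1.0e-3}, bytes);  // placeholder timings
        return 0;
    }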
diff --git a/test/hip/Test_hip_constrank.cpp b/test/hip/Test_hip_constrank.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f094486e1afa86f97cd6efe9677615f21625c6a9
--- /dev/null
+++ b/test/hip/Test_hip_constrank.cpp
@@ -0,0 +1,58 @@
+#include <iostream>
+#include <unistd.h>
+#include <memory.h>
+#include "common/Common.hpp"
+#include "tlrmvm/Tlrmvm.hpp"
+
+using namespace hiptlrmvm;
+
+int main (int argc, char ** argv){
+    auto argparser = ArgsParser(argc, argv);
+    auto originM = argparser.getint("M");
+    auto originN = argparser.getint("N");
+    auto nb = argparser.getint("nb");
+    auto ranksize = argparser.getint("ranksize");
+    auto loopsize = argparser.getint("loopsize");
+
+    // rank size should be smaller than nb.
+    TlrmvmConfig tlrmvmconfig(originM, originN, nb, ranksize);
+    /********************************
+     * hip instance
+     ********************************/
+    TlrmvmhipConstRank<complex<float>, hipComplex> cudatlrmvmptr(tlrmvmconfig);
+    cudatlrmvmptr.StreamInit(0);
+    cudatlrmvmptr.MemoryInit();
+    cudatlrmvmptr.SetTransposeConjugate(false, false);
+    cudatlrmvmptr.TryConjugateXvec();
+
+    // timing events
+    hipEvent_t start;
+    hipEvent_t stop;
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    vector<double> rawtime;
+    float milliseconds = 0;
+
+    for(int i=0; i<loopsize; i++){
+        hipEventRecord(start);
+        // do the computation and send results back to the cpu instance.
+        cudatlrmvmptr.MVM();
+        hipEventRecord(stop);
+        hipEventSynchronize(stop);
+        hipEventElapsedTime(&milliseconds, start, stop);
+        rawtime.push_back(milliseconds * 1e-3);
+    }
+
+    cudatlrmvmptr.TryConjugateResults();
+    cudatlrmvmptr.CopyBackResults();
+
+    std::sort(rawtime.begin(), rawtime.end());
+    int nruns = rawtime.size();
+    cout << "Median Time " << rawtime[nruns/2] * 1e6 << " us."<< endl;
+    double bytes = TLRMVMBytesProcessed<complex<float>>(cudatlrmvmptr.config.granksum, nb,
+                                                        originM, originN);
+    cout << "Bandwidth: " << bytes / rawtime[nruns/2] * 1e-9 << " GB/s." << endl;
+
+    cudatlrmvmptr.MemoryFree();
+    return 0;
+}
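Test_hip_constrank.cpp brackets each MVM with HIP events rather than host timers, so the measurement reflects device-side time. The pattern is generic and worth isolating; a minimal version with an empty kernel standing in for the timed work:

    #include <hip/hip_runtime.h>
    #include <iostream>

    __global__ void work() {}  // placeholder for the timed computation

    int main() {
        hipEvent_t start, stop;
        hipEventCreate(&start);
        hipEventCreate(&stop);
        hipEventRecord(start);                  // enqueue start marker
        hipLaunchKernelGGL(work, dim3(1), dim3(1), 0, 0);
        hipEventRecord(stop);                   // enqueue stop marker
        hipEventSynchronize(stop);              // block until stop is reached
        float ms = 0.0f;
        hipEventElapsedTime(&ms, start, stop);  // device-side elapsed time
        std::cout << "elapsed " << ms << " ms" << std::endl;
        hipEventDestroy(start);
        hipEventDestroy(stop);
        return 0;
    }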
diff --git a/test/hip/Test_hip_hipblas.cpp b/test/hip/Test_hip_hipblas.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b06a1f3de013f2c13cf1b9e93499471638c85257
--- /dev/null
+++ b/test/hip/Test_hip_hipblas.cpp
@@ -0,0 +1,20 @@
+#include <hipblas.h>
+#include <iostream>
+#include <hip/hip_runtime.h>
+using namespace std;
+int main(){
+    hipEvent_t start;
+    hipEvent_t stop;
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    hipEventRecord(start);
+    for(int i=0; i<10; i++){
+        cout << "hello" << endl;
+    }
+    hipEventRecord(stop);
+    hipEventSynchronize(stop);
+    float milliseconds = 0;
+    hipEventElapsedTime(&milliseconds, start, stop);
+    cout << "time "<< milliseconds << endl;
+
+}
diff --git a/test/hip/Test_hip_tlrmvm.cpp b/test/hip/Test_hip_tlrmvm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7398ad26325de513e9d666b5b8d7c03f89981ccf
--- /dev/null
+++ b/test/hip/Test_hip_tlrmvm.cpp
@@ -0,0 +1,52 @@
+#include <iostream>
+#include <unistd.h>
+#include <memory.h>
+#include "common/Common.hpp"
+#include "tlrmvm/Tlrmvm.hpp"
+
+using namespace hiptlrmvm;
+
+int main (int argc, char ** argv){
+    auto argparser = ArgsParser(argc, argv);
+    auto originM = argparser.getint("M");
+    auto originN = argparser.getint("N");
+    auto nb = argparser.getint("nb");
+    auto threshold = argparser.getstring("threshold");
+    auto problem = argparser.getstring("problem");
+    auto datafolder = argparser.getstring("datafolder");
+    auto streams = argparser.getint("streams");
+    TlrmvmConfig tlrmvmconfig(originM, originN, nb, datafolder, threshold, problem);
+    /********************************
+     * hip instance
+     ********************************/
+    Tlrmvmhip<complex<float>, hipComplex> cudaptr(tlrmvmconfig);
+    cudaptr.StreamInit(streams);
+    cudaptr.MemoryInit();
+
+    // do the computation and send results back to the cpu instance.
+    cudaptr.Phase1();
+    cudaptr.Phase2();
+    cudaptr.Phase3();
+    cudaptr.CopyBackResults();
+
+    CFPPCMatrix seismicpcmat(datafolder, threshold, nb, problem, originM, originN);
+    auto tlrmvmcpu = cudaptr.tlrmvmcpu;
+    seismicpcmat.setX(cudaptr.tlrmvmcpu->xmat);
+    auto densemat = seismicpcmat.GetDense();
+    Matrix<complex<float>> yv_pc = seismicpcmat.Phase1();
+    auto hyv = Matrix<complex<float>>(tlrmvmcpu->p1ptrs.y, tlrmvmcpu->config.workmatgranksum, 1);
+    cout << "====================================================" << endl;
+    cout << "Test TLR-MVM HIP Implementation. " << endl;
+    cout << " Phase 1 Correctness : " << hyv.allclose(yv_pc) << endl;
+    Matrix<complex<float>> yu_pc = seismicpcmat.Phase2();
+    auto hyu = Matrix<complex<float>>(tlrmvmcpu->p3ptrs.x, tlrmvmcpu->config.workmatgranksum, 1);
+    cout << " Phase 2 Correctness : " << hyu.allclose(yu_pc) << endl;
+    Matrix<complex<float>> y_pc = seismicpcmat.Phase3();
+    auto hy = Matrix<complex<float>>(tlrmvmcpu->p3ptrs.y, tlrmvmcpu->config.paddingM, 1);
+    cout << " Phase 3 Correctness : "<< hy.allclose(y_pc) << endl;
+    auto denseout = densemat * cudaptr.tlrmvmcpu->xmat;
+    cout << "dense results vs tlrmvm results " << hy.allclose(denseout) << endl;
+    cout << "====================================================" << endl;
+    cudaptr.MemoryFree();
+    return 0;
+}
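The phase-by-phase checks above compare each intermediate buffer against a dense reference through Matrix::allclose. The library's actual norm and tolerance are not visible in this diff; a rough standalone equivalent, assuming a relative L2 criterion with an illustrative threshold:

    #include <cmath>
    #include <complex>
    #include <cstddef>
    #include <vector>

    // Illustrative allclose: relative L2 distance between result and reference.
    // The tolerance is an assumption, not the value used by Matrix::allclose.
    bool allclose(const std::vector<std::complex<float>> &a,
                  const std::vector<std::complex<float>> &b,
                  double rtol = 1e-5) {
        double diff = 0.0, ref = 0.0;
        for (std::size_t i = 0; i < a.size(); ++i) {
            diff += std::norm(a[i] - b[i]);  // squared modulus of the error
            ref  += std::norm(b[i]);
        }
        return std::sqrt(diff) <= rtol * std::sqrt(ref);
    }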
" << endl; + cout << " Phase 1 Correctness : " << hyv.allclose(yv_pc) << endl; + Matrix<complex<float>> yu_pc = seismicpcmat.Phase2(); + auto hyu = Matrix<complex<float>>(tlrmvmcpu->p3ptrs.x, tlrmvmcpu->config.workmatgranksum, 1); + cout << " Phase 2 Correctness : " << hyu.allclose(yu_pc) << endl; + Matrix<complex<float>> y_pc = seismicpcmat.Phase3(); + auto hy = Matrix<complex<float>>(tlrmvmcpu->p3ptrs.y, tlrmvmcpu->config.paddingM, 1); + cout << " Phase 3 Correctness : "<< hy.allclose(y_pc) << endl; + auto denseout = densemat * cudaptr.tlrmvmcpu->xmat; + cout << "dense results vs tlrmvm results " << hy.allclose(denseout) << endl; + cout << "====================================================" << endl; + cudaptr.MemoryFree(); + return 0; +} diff --git a/test/hip/Test_hip_tlrmvm_singlecall.cpp b/test/hip/Test_hip_tlrmvm_singlecall.cpp new file mode 100644 index 0000000000000000000000000000000000000000..261877f9c9b0b4a5717eccd494f94e260907a094 --- /dev/null +++ b/test/hip/Test_hip_tlrmvm_singlecall.cpp @@ -0,0 +1,44 @@ +#include <iostream> +#include <unistd.h> +#include <memory.h> +#include "common/Common.hpp" +#include "tlrmvm/Tlrmvm.hpp" + +using namespace hiptlrmvm; + +int main (int argc, char ** argv){ + auto argparser = ArgsParser(argc, argv); + auto originM = argparser.getint("M"); + auto originN = argparser.getint("N"); + auto nb = argparser.getint("nb"); + auto threshold = argparser.getstring("threshold"); + auto problem = argparser.getstring("problem"); + auto datafolder = argparser.getstring("datafolder"); + auto streams = argparser.getint("streams"); + TlrmvmConfig tlrmvmconfig(originM, originN, nb, datafolder, threshold, problem); + /******************************** + * cuda instance + ********************************/ + Tlrmvmhip<complex<float>, hipComplex> cudatlrmvmptr(tlrmvmconfig); + cudatlrmvmptr.StreamInit(streams); + cudatlrmvmptr.MemoryInit(); + cudatlrmvmptr.SetTransposeConjugate(false, false); + cudatlrmvmptr.TryConjugateXvec(); + + // do the computation and send results back to cpu instance. + cudatlrmvmptr.MVM(); + + cudatlrmvmptr.TryConjugateResults(); + cudatlrmvmptr.CopyBackResults(); + + CFPPCMatrix seismicpcmat(datafolder, threshold, nb, problem, originM, originN); + auto densemat = seismicpcmat.GetDense(); + auto hy = Matrix<complex<float>>(cudatlrmvmptr.tlrmvmcpu->finalresults, cudatlrmvmptr.tlrmvmcpu->config.originM, 1); + auto denseout = densemat * cudatlrmvmptr.tlrmvmcpu->xmat; + cout << "====================================================" << endl; + cout << "Test TLR-MVM conjugate single call Implementation. " << endl; + cout << "dense results vs tlrmvm results " << hy.allclose(denseout) << endl; + cout << "====================================================" << endl; + cudatlrmvmptr.MemoryFree(); + return 0; +} diff --git a/thirdparty/pybind11 b/thirdparty/pybind11 deleted file mode 160000 index 45f792efdd92da094548e2095d6efdbfa7e536ee..0000000000000000000000000000000000000000 --- a/thirdparty/pybind11 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 45f792efdd92da094548e2095d6efdbfa7e536ee