diff --git a/.gitignore b/.gitignore
index 4d0b5bcc822490504c6e1f7ef47466593a8a25f3..7c053e1b7c0effcad7367aef687bf6032de04084 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,3 +30,4 @@ cudacomplex
 cudasingle
 amdcpucomplex
 amdcpusingle
+*build/
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 613ca93c3fefc77fb4380b6b915c237182206cb5..0000000000000000000000000000000000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,4 +0,0 @@
-[submodule "thirdparty/pybind11"]
-	path = thirdparty/pybind11
-	url = https://github.com/pybind/pybind11.git
-
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9cab794e33ff2a5a4e269ce29a52986331da3bc6..9df0cc7e9d8b211339a758731b574ccc01612297 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required(VERSION 3.19)
+cmake_minimum_required(VERSION 3.20)
 
 set(PROJECT_NAME TLRMVM)
 
@@ -35,7 +35,7 @@ set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib)
 option(BUILD_CPU "Build TLR-MVM using cpp" ON)
 option(BUILD_DPCPP "Build TLR-MVM on intel archs and use intel dpc++." OFF)
 option(BUILD_CUDA "Build TLR-MVM on NVIDIA gpu and cuda." OFF) # using NVIDIA GPU
-# option(BUILD_HIP "Build TLR-MVM on amd gpu and use hip." OFF) # using AMD GPU (AMD is under dev)
+ option(BUILD_HIP "Build TLR-MVM on amd gpu and use hip." OFF) # using AMD GPU (AMD is under dev)
 
 #########################
 # BLAS backend selection
@@ -190,18 +190,16 @@ endif() # BUILD_CUDA
 # HIP library
 #######################
 if(BUILD_HIP)
-    list(APPEND CMAKE_PREFIX_PATH $ENV{HIP_PATH} $ENV{ROCM_PATH}
-            $ENV{HIP_PATH}/hip $ENV{HIP_PATH}/llvm/lib/clang/14.0.0/lib/linux)
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result ")
+    enable_language(HIP)
+    find_package(HIP REQUIRED)
     include(cmake/hiptlrmvm.cmake)
     add_library(tlrmvmhiplib SHARED ${CPU_HEADERS} ${CPU_SRCS} ${HIP_HEADERS} ${HIP_SRCS})
-    find_package(hip REQUIRED)
-    find_package(hipblas REQUIRED)
-    set(TLRMVM_LIBS ${TLRMVM_LIBS} hip::device roc::hipblas)
+#    find_package(hipblas REQUIRED)
+    set(TLRMVM_LIBS ${TLRMVM_LIBS} hip::device hip::host hipblas)
     target_include_directories(tlrmvmhiplib PUBLIC ${TLRMVM_INCS})
     target_link_libraries(tlrmvmhiplib PUBLIC ${TLRMVM_LIBS})
     AddCompileDefinitions(tlrmvmhiplib)
-    target_compile_definitions(tlrmvmhiplib PUBLIC -D__HIP_PLATFORM_HCC__=1) # for clion search ...
+#    target_compile_definitions(tlrmvmhiplib PUBLIC -D__HIP_PLATFORM_HCC__=1) # for clion search ...
     target_compile_definitions(tlrmvmhiplib PUBLIC USE_HIP)
     install(TARGETS tlrmvmhiplib DESTINATION lib)
 endif() # BUILD_HIP
@@ -212,7 +210,7 @@ endif() # BUILD_HIP
 #################
 if(BUILD_PYTHON AND (BUILD_CUDA OR BUILD_HIP))
     # now python is only available for CUDA and HIP build.
-    add_subdirectory(thirdparty/pybind11)
+    find_package(pybind11 REQUIRED)
     add_subdirectory(pytlrmvm)
 endif()
 
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..32483c58a65289d12c8f954774f2152290038666
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,24 @@
+Copyright (c) 2022, King Abdullah University of Science and Technology
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright notice, this
+   list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright notice,
+   this list of conditions and the following disclaimer in the documentation
+   and/or other materials provided with the distribution.
+ * Neither the name of the copyright holder nor the names of its
+   contributors may be used to endorse or promote products derived from
+   this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/README.md b/README.md
index 1de8f157e8d50d5e09f55f4bafa8b4ccc91cc5ad..7f598fb7e1190f26bad1ea03537c8d82624b6518 100644
--- a/README.md
+++ b/README.md
@@ -105,7 +105,10 @@ Compile and install
 ## 4. Test
 You also need to download the dataset to run the experiments.
 dataset download url:
-https://drive.google.com/drive/folders/1_DSyloFjlScXGTlA1_ybJnTne59tUpgR?usp=sharing
+1. seismic redatuming dataset
+https://zenodo.org/record/6582600
+2. MAVIS AO system matrices dataset
+https://zenodo.org/record/7305622
 
 after download, put it in a seperate folder and set `WORK_ROOT` to that folder.
 
diff --git a/cmake/hiptlrmvm.cmake b/cmake/hiptlrmvm.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..33b4d2508b5b79be102be87b2a97aa9c7d83f8d6
--- /dev/null
+++ b/cmake/hiptlrmvm.cmake
@@ -0,0 +1,21 @@
+include(${PROJECT_SOURCE_DIR}/cmake/tlrmvm.cmake)
+set(HIP_SRCS)
+set(HIP_HEADERS)
+
+# Common
+file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/common/hip/*.cpp)
+list(APPEND HIP_SRCS ${TMP})
+file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/common/hip/*.cu)
+list(APPEND HIP_SRCS ${TMP})
+file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/common/hip/*.hpp)
+list(APPEND HIP_HEADERS ${TMP})
+
+# tlrmvm
+file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/tlrmvm/hip/*.cpp)
+list(APPEND HIP_SRCS ${TMP})
+file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/tlrmvm/hip/*.cu)
+list(APPEND HIP_SRCS ${TMP})
+file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/tlrmvm/hip/*.hpp)
+list(APPEND HIP_HEADERS ${TMP})
+file(GLOB TMP ${PROJECT_SOURCE_DIR}/src/tlrmvm/hip/*.cuh)
+list(APPEND HIP_HEADERS ${TMP})
\ No newline at end of file
diff --git a/pytlrmvm/convertmat2npy.py b/pytlrmvm/convertmat2npy.py
deleted file mode 100644
index 8d4a403d442a56f886e975ab73ecdb3e7e18dfcc..0000000000000000000000000000000000000000
--- a/pytlrmvm/convertmat2npy.py
+++ /dev/null
@@ -1,27 +0,0 @@
-##################################################################
-# @copyright (c) 2021- King Abdullah University of Science and
-#                      Technology (KAUST). All rights reserved.
-#
-# Author: Yuxi Hong, 2021.10.27
-# Description: Convert matlab .mat file to numpy npy.
-##################################################################
-import os
-from os.path import join
-from scipy.io import loadmat
-import numpy as np
-import pickle 
-import argparse
-import time
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--filename', type=str, help='matfilename')
-parser.add_argument('--mat_root', type=str, help='matfile dir')
-parser.add_argument('--work_root', type=str, help='workspace dir')
-args = parser.parse_args()
-
-matname = join(args.mat_root, args.filename + '.mat')
-npyname = join(args.work_root, args.filename + '.npy')
-work_root = args.work_root
-A = loadmat(matname)['Rfreq']
-with open(npyname, 'wb') as f:
-    np.save(f, A)
diff --git a/pytlrmvm/generateseismicinput.py b/pytlrmvm/generateseismicinput.py
deleted file mode 100644
index f388c7c9665a8b4e3a50c86bcf393121e0d21a3c..0000000000000000000000000000000000000000
--- a/pytlrmvm/generateseismicinput.py
+++ /dev/null
@@ -1,77 +0,0 @@
-import numpy as np
-import sys
-import os
-import argparse
-import time
-
-parser = argparse.ArgumentParser()
-parser.add_argument('--TLRMVM_ROOT', type=str, help='installation dir')
-parser.add_argument('--WORK_ROOT', type=str, help='workspace dir')
-parser.add_argument('--nb', type=int, help='nb')
-parser.add_argument('--error_threshold', type=str,help='error threshold')
-parser.add_argument('--problemname', type=str, help='problem name')
-parser.add_argument('--datatype', type=str, help='datatype of dataset')
-
-args = parser.parse_args()
-
-print("Your installation path: ", args.TLRMVM_ROOT)
-print("Your workspace path: ", args.WORK_ROOT)
-
-# your TLRMVM_ROOT should point to your installation path.
-TLRMVM_ROOT = args.TLRMVM_ROOT
-sys.path.append( os.path.join( args.TLRMVM_ROOT,"python") )
-WORK_ROOT = args.WORK_ROOT
-print("Downloading dataset to path: {}".format( WORK_ROOT ))
-if not os.path.exists(WORK_ROOT):
-    os.mkdir(WORK_ROOT)
-problemname = args.problemname
-
-from tlrmvmtools import *
-dtype = None
-datatype = args.datatype
-if datatype == 'float':
-    dtype = np.float32
-elif datatype == 'double':
-    dtype = np.float64
-elif datatype == 'csingle':
-    dtype = np.csingle
-elif datatype == 'cdouble':
-    dtype = np.cdouble
-else:
-    print("Not support datatype.")
-    exit(1)
-
-# get your data matrix in numpy format
-# you can download the matrix used in this example at
-# https://drive.google.com/file/d/1KY4eeVSMm2mWOOKVU7QjsAf6tOREv-99/view?usp=sharing
-A = np.load(  os.path.join(WORK_ROOT, "{}.npy".format(problemname)) ).astype(dtype)
-
-if datatype in ['csingle', 'cdouble']:
-    randomx = np.random.rand(A.shape[1]) + 1j * np.random.rand(A.shape[1])
-    randomx = randomx.astype(dtype)
-else:
-    randomx = np.random.rand(A.shape[1])
-    randomx = randomx.astype(dtype)
-
-"""
-Below is needed for creating input of tlrmvm.
-They are parameters Tile size (nb) and Accuracy Threshold (error_threshold) 
-you can play with to get decent performance and numerical accuracy.
-"""
-m = A.shape[0]
-n = A.shape[1]
-nb = args.nb
-error_threshold = args.error_threshold # we use string for easy concatnating.
-workplacefolder = WORK_ROOT
-datasetname = args.problemname
-
-# create tlrmvm util class
-tlrmvmutil = TLRMVM_Util(A, nb, workplacefolder, error_threshold, datasetname)
-# compute svd and save
-tlrmvmutil.computesvd()
-# create input of tlrmvm
-tlrmvmutil.saveUV()
-# get compression info
-tlrmvmutil.printdatainfo()
-
-tlrmvmutil.saveX(randomx)
\ No newline at end of file
diff --git a/pytlrmvm/src/Wrapper.cpp b/pytlrmvm/src/Wrapper.cpp
index ffc5170617158710cf9c8e7b4a9e6dcc54e72d21..05ea3d204047ceeedd7c1f85d286129293cab2ef 100644
--- a/pytlrmvm/src/Wrapper.cpp
+++ b/pytlrmvm/src/Wrapper.cpp
@@ -157,7 +157,6 @@ PYBIND11_MODULE(TLRMVMpy, m) {
     m.def("BatchUpdatexgpu_INT8_cf", &BatchUpdatex_INT8);
     m.def("BatchUpdateygpu_INT8_cf", &BatchUpdatey_INT8);
 
-    addbatchtlrmvmgpufp16int8(m);
 
     m.def("SetMaskmat", &SetMaskmat);
 }
diff --git a/pytlrmvm/tlrmvmtools.py b/pytlrmvm/tlrmvmtools.py
index c8561451773cc4f08709dd343621fef5256e1a12..152c3192dba45008767e66f3e22a7d160ae1de4f 100644
--- a/pytlrmvm/tlrmvmtools.py
+++ b/pytlrmvm/tlrmvmtools.py
@@ -2,113 +2,118 @@
 # @copyright (c) 2021- King Abdullah University of Science and
 #                      Technology (KAUST). All rights reserved.
 #
-# Author: Yuxi Hong, 2021.10.27
-# Description: A tools for generating compressed U and V bases.
-# They are input of TLR-MVM.
+# Author: Yuxi Hong
+# Description: A tool for generating compressed U and V bases
+# which are input of TLR-MVM.
 ##################################################################
 import os
 from os.path import join, exists
 from tqdm import tqdm
 import numpy as np
-import pickle 
+import pickle
+from scipy.linalg import svd
+
 
 class TLRMVM_Util:
-    """A TLR-MVM Utility class 
+    """A TLR-MVM Utility class
     1. compute svd for input of TLR-MVM
     3. save U and V bases
     4. save Dense matrix
     """
+
     def __init__(self, denseAarray, nb, datafolder, error_threshold, problemname, rankmodule) -> None:
         self.denseA = denseAarray
         self.dtype = denseAarray.dtype
         self.m = denseAarray.shape[0]
         self.n = denseAarray.shape[1]
-        self.nb = nb 
+
+        self.nb = nb
         self.mtg = self.m // nb if self.m % nb == 0 else self.m // nb + 1
         self.ntg = self.n // nb if self.n % nb == 0 else self.n // nb + 1
-        self.paddingm = self.mtg * nb 
-        self.paddingn = self.ntg * nb 
+        self.paddingm = self.mtg * nb
+        self.paddingn = self.ntg * nb
         self.datafolder = datafolder
         if not exists(self.datafolder):
             print("Folder {} not exists!".format(self.datafolder))
         self.error_threshold = error_threshold
         self.problemname = problemname
         self.rankfile = join(self.datafolder, 'compresseddata',
-        '{}_Rmat_nb{}_acc{}.bin'.format(self.problemname,self.nb,self.error_threshold))
+                             '{}_Rmat_nb{}_acc{}.bin'.format(self.problemname, self.nb, self.error_threshold))
         self.rankmodule = rankmodule
 
     def computesvd(self):
         A = self.denseA
-        padding_m = self.paddingm 
+        padding_m = self.paddingm
         padding_n = self.paddingn
-        m = self.m 
-        n = self.n 
-        ntiles = self.ntg 
-        mtiles = self.mtg 
+        m = self.m
+        n = self.n
+        mtiles = self.mtg
+        ntiles = self.ntg
         svdsavepath = join(self.datafolder, 'SVDinfo')
         if not exists(svdsavepath):
             os.mkdir(svdsavepath)
-        nb = self.nb 
-        svdname = join( svdsavepath, '{}_nb{}.pickle'.format(self.problemname,nb) )
+        nb = self.nb
+        svdname = join(svdsavepath, '{}_nb{}.pickle'.format(self.problemname, nb))
         if exists(svdname):
             print("svd {} exists.".format(svdname))
-            return 
+            return
         else:
             print("save svd to {}. ".format(svdname))
         bigmap = dict()
-        padA = np.zeros((padding_m,padding_n),dtype=self.dtype)
-        padA[:m,:n] = A
+        padA = np.zeros((padding_m, padding_n), dtype=self.dtype)
+        padA[:m, :n] = A
         for j in tqdm(range(ntiles)):
             for i in range(mtiles):
-                curblock = padA[i*nb:(i+1)*nb, j*nb:(j+1)*nb]
-                [u,s,v]  = np.linalg.svd(curblock)
-                bigmap['{}_{}'.format(i,j)] = [u,s,v]
-        with open( svdname,'wb') as f:
-            pickle.dump(bigmap,  f)
+                curblock = padA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb]
+                [u, s, v] = svd(curblock)
+                bigmap['{}_{}'.format(i, j)] = [u, s, v]
+        with open(svdname, 'wb') as f:
+            pickle.dump(bigmap, f)
 
     def saveX(self, xvec):
         xfile = join(self.datafolder, '{}_x.bin'.format(self.problemname))
         xvec.tofile(xfile)
 
     def saveUV(self):
-        svdname = join( self.datafolder, 'SVDinfo', '{}_nb{}.pickle'.format(self.problemname,self.nb) )
+        svdname = join(self.datafolder, 'SVDinfo', '{}_nb{}.pickle'.format(self.problemname, self.nb))
         if not exists(svdname):
             print("please do svd to matrix first!")
         with open(svdname, 'rb') as f:
             bigmap = pickle.load(f)
-        nb = self.nb 
+        nb = self.nb
         acc = self.error_threshold
-        uvsavepath = join(self.datafolder,'compresseddata')
+        uvsavepath = join(self.datafolder, 'compresseddata')
         if not exists(uvsavepath):
             os.mkdir(uvsavepath)
-        ufile = uvsavepath + '/{}_Ubases_nb{}_acc{}.bin'.format(self.problemname,nb,acc)
-        vfile = uvsavepath + '/{}_Vbases_nb{}_acc{}.bin'.format(self.problemname,nb,acc)
-        rfile = uvsavepath + '/{}_Rmat_nb{}_acc{}.bin'.format(self.problemname,nb,acc)
-        if exists(ufile) and exists(vfile) and exists(rfile):
-            print("Compress file exists. ")
-            return
+        ufile = uvsavepath + '/{}_Ubases_nb{}_acc{}.bin'.format(self.problemname, nb, acc)
+        vfile = uvsavepath + '/{}_Vbases_nb{}_acc{}.bin'.format(self.problemname, nb, acc)
+        rfile = uvsavepath + '/{}_Rmat_nb{}_acc{}.bin'.format(self.problemname, nb, acc)
+
         print("generate uvr file to {}.".format(uvsavepath))
-        padding_m = self.paddingm 
+        padding_m = self.paddingm
         padding_n = self.paddingn
-        m = self.m 
-        n = self.n 
-        ntiles = self.ntg 
-        mtiles = self.mtg 
-        uvsavepath = self.datafolder
-        nb = self.nb 
+        m = self.m
+        n = self.n
+        ntiles = self.ntg
+        mtiles = self.mtg
+        nb = self.nb
         tmpacc = self.error_threshold
-        acc = tmpacc if isinstance(tmpacc,float) else float(tmpacc)
-        ApproximateA = np.zeros((padding_m, padding_n),dtype=self.dtype)
-        originpadA = np.zeros((padding_m, padding_n),dtype=self.dtype)
-        originpadA[:m,:n] = self.denseA
-        normA = np.linalg.norm(self.denseA,'fro')
-        ranklist = np.zeros(mtiles * ntiles,dtype=np.int32)
+        acc = tmpacc if isinstance(tmpacc, float) else float(tmpacc)
+        ApproximateA = np.zeros((padding_m, padding_n), dtype=self.dtype)
+        originpadA = np.zeros((padding_m, padding_n), dtype=self.dtype)
+        originpadA[:m, :n] = self.denseA
+        normA = np.linalg.norm(self.denseA, 'fro')
+        ranklist = np.zeros((mtiles, ntiles), dtype=np.int32)
+        print("rankmat shape, ", ranklist.shape)
         ulist = [[] for _ in range(mtiles)]
         vlist = [[] for _ in range(mtiles)]
-        for i in tqdm(range(mtiles)):
-            for j in range(ntiles):
-                curblock = originpadA[i*nb:(i+1)*nb, j*nb:(j+1)*nb] 
-                [u,s,v] = bigmap['{}_{}'.format(i,j)]
+
+        p = mtiles
+        for i in tqdm(range(mtiles - 1)):
+            for j in range(ntiles - 1):
+                curblock = originpadA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb]
+                normblock = np.linalg.norm(curblock, 'fro')
+                [u, s, v] = bigmap['{}_{}'.format(i, j)]
                 srk = 0
                 erk = nb
                 while srk != erk:
@@ -116,54 +121,169 @@ class TLRMVM_Util:
                     tmpu = u[:, :midrk]
                     tmps = s[:midrk]
                     tmpv = v[:midrk, :]
-                    if np.linalg.norm(curblock-(tmpu*tmps)@tmpv, ord='fro') < normA * acc:
+                    if np.linalg.norm(curblock - (tmpu * tmps) @ tmpv, ord='fro') < normA * acc:
                         erk = midrk
                     else:
-                        srk = midrk+1
-                if srk % self.rankmodule != 0:
-                    mod4srk = ((srk//self.rankmodule) + 1) * self.rankmodule
-                else:
-                    mod4srk = srk
-                mod4srk = min(mod4srk, nb)
-                srk = mod4srk
+                        srk = midrk + 1
+                if srk == 0:
+                    srk = 1
                 tmpu = u[:, :srk]
                 tmps = s[:srk]
                 tmpv = v[:srk, :]
+                ApproximateA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb] = (tmpu * tmps) @ tmpv
+                us = tmpu * tmps
+                vt = tmpv
                 if srk == 0:
-                    ApproximateA[i*nb:(i+1)*nb, j*nb:(j+1)*nb] = np.zeros((nb,nb),dtype=self.dtype)
+                    ranklist[i, j] = 1
+                    ulist[i].append(np.zeros((nb, 1), dtype=self.dtype))
+                    vlist[i].append(np.zeros((1, nb), dtype=self.dtype))
+                else:
+                    ranklist[i, j] = srk
+                    ulist[i].append(us)
+                    vlist[i].append(vt)
+
+        def getsrk(normA, nb, acc, u, s, v):
+            srk = 0
+            erk = nb
+            while srk != erk:
+                midrk = (srk + erk) // 2
+                tmpu = u[:, :midrk]
+                tmps = s[:midrk]
+                tmpv = v[:midrk, :]
+                if np.linalg.norm(curblock - (tmpu * tmps) @ tmpv, ord='fro') < normA * acc:
+                    erk = midrk
                 else:
-                    ApproximateA[i*nb:(i+1)*nb, j*nb:(j+1)*nb] = (tmpu*tmps)@tmpv
-                us = tmpu * tmps 
+                    srk = midrk + 1
+            return srk
+
+        for i in tqdm(range(mtiles)):
+            for j in range(ntiles):
+                if i < mtiles - 1 and j < ntiles - 1:
+                    continue
+                curblock = originpadA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb]
+                normblock = np.linalg.norm(curblock, 'fro')
+                [u, s, v] = bigmap['{}_{}'.format(i, j)]
+                if i < mtiles - 1 or j < ntiles - 1:
+                    if i == mtiles - 1:
+                        presum = np.sum(ranklist[:, j])
+                        srk = getsrk(normA, nb, acc, u, s, v)
+                        while srk < nb and (srk + presum) % self.rankmodule != 0:
+                            srk += 1
+
+                        if srk == nb and (srk + presum) % self.rankmodule != 0:
+                            print("can't find a solution! i = mtiles")
+                            exit()
+                        else:
+                            ranklist[i, j] = srk
+                    elif j == ntiles - 1:
+                        presum = np.sum(ranklist[i, :])
+                        srk = getsrk(normA, nb, acc, u, s, v)
+                        while srk < nb and (srk + presum) % self.rankmodule != 0:
+                            srk += 1
+                        if srk == nb and (srk + presum) % self.rankmodule != 0:
+                            print("can't find a solution! j = ntiles")
+                            exit()
+                        else:
+                            ranklist[i, j] = srk
+                elif i == mtiles - 1 and j == ntiles - 1:
+                    srk = 0
+                    while srk < nb and (srk + np.sum(ranklist[i, :])) % self.rankmodule != 0 and \
+                            (srk + np.sum(ranklist[:, j])) % self.rankmodule != 0:
+                        srk += 1
+                    if srk == nb:
+                        print("can't find a solution!")
+                        exit()
+                    else:
+                        ranklist[i, j] = srk
+                if srk == 0:
+                    srk = self.rankmodule
+                tmpu = u[:, :srk]
+                tmps = s[:srk]
+                tmpv = v[:srk, :]
+                ApproximateA[i * nb:(i + 1) * nb, j * nb:(j + 1) * nb] = (tmpu * tmps) @ tmpv
+                us = tmpu * tmps
                 vt = tmpv
                 if srk == 0:
-                    ranklist[j*mtiles+i] = 1
-                    ulist[i].append(np.zeros((nb,1),dtype=self.dtype))
-                    vlist[i].append(np.zeros((1,nb),dtype=self.dtype))
+                    ranklist[i, j] = 1
+                    ulist[i].append(np.zeros((nb, 1), dtype=self.dtype))
+                    vlist[i].append(np.zeros((1, nb), dtype=self.dtype))
                 else:
-                    ranklist[j*mtiles+i] = srk
+                    ranklist[i, j] = srk
                     ulist[i].append(us)
                     vlist[i].append(vt)
         tmpurow = []
         for x in ulist:
-            tmpurow.append(np.concatenate(x,axis=1))
-        finalu = np.concatenate(tmpurow,axis=1)
+            tmpurow.append(np.concatenate(x, axis=1))
+        finalu = np.concatenate(tmpurow, axis=1)
         finalu.T.tofile(ufile)
         tmpvcol = []
-        npvlist = np.array(vlist,dtype=np.object)
+        npvlist = np.array(vlist, dtype=np.object)
         for i in range(npvlist.shape[1]):
-            tmpvcol.append(np.concatenate(npvlist[:,i],axis=0))
-        
+            tmpvcol.append(np.concatenate(npvlist[:, i], axis=0))
+
         with open(vfile, 'wb') as f:
             for x in tmpvcol:
                 x.T.tofile(f)
-        ranklist.tofile(rfile)
+        ranklist.T.tofile(rfile)
 
     def printdatainfo(self):
         print("Description of Dataset: ")
-        print("problem name : {} ".format(self.problemname) )
+        print("problem name : {} ".format(self.problemname))
         print("m is {} n {} nb is {} error threshold is {}.".format(self.m, self.n, self.nb, self.error_threshold))
-        rankfile = join(self.datafolder, 'compresseddata', '{}_Rmat_nb{}_acc{}.bin'.format(self.problemname, self.nb, self.error_threshold))
+        rankfile = join(self.datafolder, 'compresseddata',
+                        '{}_Rmat_nb{}_acc{}.bin'.format(self.problemname, self.nb, self.error_threshold))
         self.ranklist = np.fromfile(rankfile, dtype=np.int32)
-        mn = self.m * self.n 
+        mn = self.m * self.n
         rank = np.sum(self.ranklist)
-        print("Global rank is {}, compression rate is {:.3f}%.".format( rank, 2*rank*self.nb / mn * 100))
+        print("Global rank is {}, compression rate is {:.3f}%.".format(rank, 2 * rank * self.nb / mn * 100))
+
+
+if __name__ == "__main__":
+    import numpy as np
+    import os
+    import argparse
+    from astropy.io.fits import open as fitsopen
+    from scipy.io import loadmat
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--nb', type=int, help='nb')
+    parser.add_argument('--error_threshold', type=str,help='error threshold.')
+    parser.add_argument('--compressed_name', type=str, help='The file name for compressed U,V,and R.')
+    parser.add_argument('--data_dir', type=str, help='your original data dir.')
+    parser.add_argument('--data_type', type=str, help='datatype of dataset.')
+    parser.add_argument('--data_name', type=str, help='The name of original matrix.')
+    parser.add_argument('--matlabmat_name', type=str, default=None, help='The name of original matrix in .mat file.')
+    parser.add_argument('--rank_module', type=int, help='rank module.')
+    args = parser.parse_args()
+    dtype = None
+    datatype = args.data_type
+    if datatype == 'float':
+        dtype = np.float32
+    elif datatype == 'double':
+        dtype = np.float64
+    elif datatype == 'csingle':
+        dtype = np.csingle
+    elif datatype == 'cdouble':
+        dtype = np.cdouble
+    else:
+        print("Not support datatype.")
+        exit(1)
+    A = None
+    if args.data_name.split('.')[-1] == 'npy':
+        A = np.load(join(args.data_dir,args.data_name)).astype(dtype)
+    elif args.data_name.split('.')[-1] == 'fits':
+        A = fitsopen(join(args.data_dir,args.data_name))[0].data.astype(dtype)
+    elif args.data_name.split('.')[-1] == 'mat':
+        A = loadmat(join(args.data_dir,args.data_name))[args.matlabmat_name]
+    else:
+        A = pickle.load(open(join(args.data_dir,args.data_name))).astype(dtype)
+    rankmodule = int(args.rank_module)
+    if rankmodule == 0:
+        print("not 0.")
+        exit()
+    tlrmvmutil = TLRMVM_Util(A, args.nb, args.data_dir, args.error_threshold, args.compressed_name, rankmodule)
+    # compute svd and save
+    tlrmvmutil.computesvd()
+    # create input of tlrmvm
+    tlrmvmutil.saveUV()
+    # get compression info
+    tlrmvmutil.printdatainfo()
diff --git a/setup.py b/setup.py
index 69e874385d96dc68dfe463c7b87bd04a770d7846..f61cd4740a54f4c111894721319be129ff8d8eb1 100644
--- a/setup.py
+++ b/setup.py
@@ -64,7 +64,7 @@ class build_ext(build_ext_orig):
                            "-DCMAKE_C_COMPILER={}".format(self.c_compiler),
                            "-DCMAKE_CXX_COMPILER={}".format(self.cxx_compiler),
                            "-DCMAKE_CUDA_HOST_COMPILER={}".format(self.cxx_compiler),
-                           "-DCMAKE_CUDA_FLAGS='-ccbin {}'".format(self.cxx_compiler),
+                           #"-DCMAKE_CUDA_FLAGS='-ccbin {}'".format(self.cxx_compiler),
                            "-DUSE_MKL=ON",
                            "-DUSE_MPI=ON",
                            "-DBUILD_CUDA=ON",
diff --git a/src/common/cpu/Util.cpp b/src/common/cpu/Util.cpp
index 78bac44298e95b55385257a70328de72a817f258..8fb2883c80b7958e4828f139bde660944f4ea4cc 100644
--- a/src/common/cpu/Util.cpp
+++ b/src/common/cpu/Util.cpp
@@ -137,22 +137,16 @@ ArgsParser::ArgsParser(int argc, char**argv){
 
 int ArgsParser::getint(string key){
     if(argmap.find(key) == argmap.end())
-    {cout << "key error in getint" << endl; exit(0);}
+    {cout << "key error in getint:" << key << endl; exit(0);}
     return atoi(argmap[key].c_str());
 }
 
 string ArgsParser::getstring(string key){
     if(argmap.find(key) == argmap.end())
-    {cout << "key error in getstring" << endl; exit(0);}
+    {cout << "key error in getstring: "<< key << endl; exit(0);}
     return argmap[key];
 }
 
-bool ArgsParser::getbool(string key){
-    if(argmap.find(key) == argmap.end())
-    {cout << "key error in getint" << endl; exit(0);}
-    return atoi(argmap[key].c_str());
-}
-
 
 
 template<typename T>
diff --git a/src/tlrmvm/cpu/TlrmvmCPU.cpp b/src/tlrmvm/cpu/TlrmvmCPU.cpp
index a8809225de20f3d6b3bbfc7b50d0604fa99978e9..d2e7c8c25edf2a62dc0a60411f66b39a94894a5a 100644
--- a/src/tlrmvm/cpu/TlrmvmCPU.cpp
+++ b/src/tlrmvm/cpu/TlrmvmCPU.cpp
@@ -7,6 +7,8 @@
 #include "../../common/AppUtil.hpp"
 #include "../../common/cpu/Util.hpp"
 #include <memory>
+#include <string.h>
+#include <stdio.h>
 #ifdef USE_MPI
 #include <mpi.h>
 #endif
@@ -124,7 +126,7 @@ void TlrmvmBase<T>::InitData(){
         RandomX(Datax, config.originN);
         this->xmat = Matrix<T>(Datax, config.paddingN, 1);
     }else{
-        char filename[200];
+        char filename[300];
         sprintf(filename, "%s/%s_Ubases_nb%d_acc%s.bin",
                 config.datafolder.c_str(), config.problemname.c_str(),config.nb, config.acc.c_str());
         size_t elems = config.granksum * config.nb;
@@ -187,10 +189,10 @@ void TlrmvmBase<T>::Phase1GetMembuffer(){
 
 template<typename T>
 void TlrmvmBase<T>::Phase1GetMembufferTranspose() {
-    GetHostMemory(&p1transptrs.Abp, config.Ntg);
-    GetHostMemory(&p1transptrs.xbp, config.Ntg);
-    GetHostMemory(&p1transptrs.ybp, config.Ntg);
-    for(int i=0; i<config.Ntg; i++){
+    GetHostMemory(&p1transptrs.Abp, config.Mtg);
+    GetHostMemory(&p1transptrs.xbp, config.Mtg);
+    GetHostMemory(&p1transptrs.ybp, config.Mtg);
+    for(int i=0; i<config.Mtg; i++){
         p1transptrs.Ms.push_back(config.rowsum[i]);
         p1transptrs.Ks.push_back(config.nb);
         p1transptrs.Ns.push_back(1);
@@ -198,7 +200,7 @@ void TlrmvmBase<T>::Phase1GetMembufferTranspose() {
     p1transptrs.Acnt = 0;
     p1transptrs.Xcnt = 0;
     p1transptrs.Ycnt = 0;
-    for(int i=0; i<config.Ntg; i++){
+    for(int i=0; i<config.Mtg; i++){
         p1transptrs.Acnt += p1transptrs.Ms[i] * p1transptrs.Ks[i];
         p1transptrs.Xcnt += p1transptrs.Ks[i] * p1transptrs.Ns[i];
         p1transptrs.Ycnt += p1transptrs.Ms[i] * p1transptrs.Ns[i];
@@ -347,10 +349,10 @@ void TlrmvmBase<T>::Phase2PrepareTranspose() {
 
 template<typename T>
 void TlrmvmBase<T>::Phase3GetMembuffer(){
-    GetHostMemory(&p3ptrs.Abp, config.Ntg);
-    GetHostMemory(&p3ptrs.xbp, config.Ntg);
-    GetHostMemory(&p3ptrs.ybp, config.Ntg);
-    for(int i=0; i<config.Ntg; i++){
+    GetHostMemory(&p3ptrs.Abp, config.Mtg);
+    GetHostMemory(&p3ptrs.xbp, config.Mtg);
+    GetHostMemory(&p3ptrs.ybp, config.Mtg);
+    for(int i=0; i<config.Mtg; i++){
         p3ptrs.Ms.push_back(config.nb);
         p3ptrs.Ks.push_back(config.rowsum[i]);
         p3ptrs.Ns.push_back(1);
@@ -358,7 +360,7 @@ void TlrmvmBase<T>::Phase3GetMembuffer(){
     p3ptrs.Acnt = 0;
     p3ptrs.Xcnt = 0;
     p3ptrs.Ycnt = 0;
-    for(int i=0; i<config.Ntg; i++){
+    for(int i=0; i<config.Mtg; i++){
         p3ptrs.Acnt += p3ptrs.Ms[i] * p3ptrs.Ks[i];
         p3ptrs.Xcnt += p3ptrs.Ks[i] * p3ptrs.Ns[i];
         p3ptrs.Ycnt += p3ptrs.Ms[i] * p3ptrs.Ns[i];
diff --git a/src/tlrmvm/hip/BatchTlrmvmhip.cpp b/src/tlrmvm/hip/BatchTlrmvmhip.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..835d647ef0123a3a5446ccd856d4990a2e6cc107
--- /dev/null
+++ b/src/tlrmvm/hip/BatchTlrmvmhip.cpp
@@ -0,0 +1,712 @@
+//
+// Created by Yuxi Hong on 02/03/2022.
+//
+
+#include "../../common/Common.hpp"
+#include "../cpu/TlrmvmCPU.hpp"
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hipblas.h>
+#include "BatchTlrmvmhip.hpp"
+#include "hipkernel.cuh"
+#include <chrono>
+
+namespace hiptlrmvm
+{
+    template<typename HostType, typename DeviceType>
+    BatchTlrmvmhip<HostType, DeviceType>::BatchTlrmvmhip(vector<TlrmvmConfig> tlrmvmconfigvec)
+            :config_vec(tlrmvmconfigvec),batchsize(tlrmvmconfigvec.size())
+    {
+        // Constructor: keeps a copy of each per-problem config, optionally binds this
+        // rank to a HIP device (MPI builds), and builds one CPU instance per config.
+        cout << "calling Batch Tlrmvmcuda" << endl; // NOTE(review): message still says "cuda" in the HIP port — confirm intended
+#ifdef USE_MPI
+        int initflag;
+        MPI_Initialized(&initflag);
+        if(initflag == 1){ // only do device selection when MPI is actually initialized
+            int rank;
+            int size;
+            MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+            MPI_Comm_size(MPI_COMM_WORLD,&size);
+            if(rank == 0)
+                cout << "we are in mpi environment:" << endl;
+            int totaldevcount = 0;
+            HIPCHECK(hipGetDeviceCount(&totaldevcount));
+            if(totaldevcount < size){ // fewer GPUs than ranks: fall back to sharing device 0
+                if(rank == 0)
+                    cout << "not enough cards, in debug mode, set all to 0." << endl;
+                HIPCHECK(hipSetDevice(0));
+            }else{
+                if(rank == 0)
+                    cout << "we have enough cards, set to different cards." << endl;
+                HIPCHECK(hipSetDevice(rank%8)); // assumes at most 8 GPUs per node — TODO confirm for target machines
+            }
+        }
+
+#endif
+        cpuinstvec.resize(tlrmvmconfigvec.size());
+        for(int i=0; i<tlrmvmconfigvec.size(); i++)
+            cpuinstvec[i] = std::move(make_shared<TlrmvmCPU<HostType>>(tlrmvmconfigvec[i])); // std::move on a prvalue is redundant (move happens anyway)
+        init_alpha_beta(alpha, beta);
+        finalresults.resize(tlrmvmconfigvec.size() * tlrmvmconfigvec[0].originM); // assumes every config shares originM of config[0] — TODO confirm
+    }
+
+    template<typename HostType, typename DeviceType>
+    BatchTlrmvmhip<HostType, DeviceType>::BatchTlrmvmhip(){} // default ctor: members left for later StreamInit/MemoryInit
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::StreamInit(int streamsize){
+        // Create `streamsize` non-blocking HIP streams plus one hipBLAS handle per
+        // stream, then size the graph bookkeeping objects for this batch.
+        this->stream_size = streamsize;
+        streamptr = new hipStream_t[streamsize]; // raw arrays; freed elsewhere — TODO confirm matching delete[]
+        cublashandleptr = new hipblasHandle_t[streamsize]; // name kept from the CUDA version; these are hipBLAS handles
+        for(int i=0; i<streamsize; i++)
+            hipStreamCreateWithFlags(&streamptr[i], hipStreamNonBlocking); // return codes not checked
+        for(int i=0; i<streamsize; i++)
+            hipblasCreate(&cublashandleptr[i]);
+        for(int i=0; i<streamsize; i++)
+            hipblasSetStream(cublashandleptr[i], streamptr[i]); // handle i always issues work on stream i
+        multigraph.StreamInit(batchsize, streamsize);
+        transposemultigraph.StreamInit(batchsize, streamsize);
+        singlegraph.StreamInit(streamsize);
+        transposesinglegraph.StreamInit(streamsize);
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::MemoryInit(){
+        // Load per-problem CPU data (in parallel), mirror the non-pointer metadata
+        // into the device-side pointer sets, then run all phase-prepare steps.
+        cudap1ptrs_vec.resize(batchsize);
+        cudap3ptrs_vec.resize(batchsize);
+        cudap1transptrs_vec.resize(batchsize);
+        cudap3transptrs_vec.resize(batchsize);
+        auto start = std::chrono::steady_clock::now();
+#pragma omp parallel for default(none)
+        for(int bi=0; bi<batchsize; bi++){ // NOTE(review): default(none) with no shared()/firstprivate() clauses for the members used here — some compilers reject this; verify it builds under strict OpenMP
+            cpuinstvec[bi]->MemoryInit(); // reads problem data from disk (timed below)
+            PhasePointersCopyNonPointers<HostType, DeviceType>
+                    (cudap1ptrs_vec[bi],cpuinstvec[bi]->p1ptrs);
+            PhasePointersCopyNonPointers<HostType, DeviceType>
+                    (cudap3ptrs_vec[bi],cpuinstvec[bi]->p3ptrs);
+            PhasePointersCopyNonPointers<HostType, DeviceType>(
+                    cudap1transptrs_vec[bi],cpuinstvec[bi]->p1transptrs);
+            PhasePointersCopyNonPointers<HostType, DeviceType>(
+                    cudap3transptrs_vec[bi],cpuinstvec[bi]->p3transptrs);
+        }
+        auto end = std::chrono::steady_clock::now();
+        auto elapse_time = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count(); // long long; matches MPI_LONG_LONG below
+#ifdef USE_MPI
+        int initflag;
+        MPI_Initialized(&initflag);
+        if(initflag == 1){
+            int rank;
+            int size;
+            MPI_Comm_rank(MPI_COMM_WORLD,&rank);
+            MPI_Comm_size(MPI_COMM_WORLD,&size);
+            MPI_Barrier(MPI_COMM_WORLD);
+            auto recv_buffer = elapse_time;
+            MPI_Allreduce(&elapse_time, &recv_buffer, 1,
+                          MPI_LONG_LONG, MPI_MAX, MPI_COMM_WORLD); // report the slowest rank's load time
+            if(rank == 0) cout << "Reading data buffer takes time is " << recv_buffer * 1e-6 << " seconds."<< endl;
+        }
+#else
+        cout << "Reading data buffer takes time is " << elapse_time * 1e-6 << endl;
+#endif
+        Phase1Prepare();
+        Phase2Prepare();
+        Phase3Prepare();
+        // transpose
+        Phase1PrepareTranspose();
+        Phase2PrepareTranspose();
+        Phase3PrepareTranspose();
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType, DeviceType>::SetTransposeConjugate(bool transpose, bool conjugate){
+        // Record the transpose/conjugate mode and propagate it to every CPU instance.
+        this->transpose = transpose;
+        this->conjugate = conjugate;
+        for(auto &x : cpuinstvec) x->SetTransposeConjugate(transpose, conjugate);
+    }
+
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::setX(HostType * xvector, size_t xlength){
+        // Scatter one concatenated input vector into the per-problem CPU instances.
+        int offset = 0; // NOTE(review): int offset against size_t lengths — fine unless totals exceed INT_MAX
+        assert(xlength == config_vec[0].originN * batchsize); // assumes all configs share originN — TODO confirm; also mixes signed/unsigned
+        for(int i=0; i<cpuinstvec.size(); i++){
+            cpuinstvec[i]->setX(xvector + offset , config_vec[i].originN);
+            offset += config_vec[i].originN;
+        }
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::TryConjugateXvec() {
+        // Conjugate x on the CPU side (if the mode requires it) and re-upload the
+        // phase-1 x buffer for every problem in the batch.
+        for(int bi=0; bi<batchsize; bi++){
+            // no transpose logic
+            cpuinstvec[bi]->TryConjugateXvec();
+            CopyDataB2HD((HostType*)cudap1ptrs_vec[bi].x, cpuinstvec[bi]->p1ptrs.x, cpuinstvec[bi]->xmat.Shape()[0]);
+        }
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::Phase1(){
+        // Phase 1 (non-transpose): one gemv per tile column, round-robined over the
+        // hipBLAS handles (each handle is bound to its own stream in StreamInit).
+        hipDeviceSynchronize();
+        for(int bi=0; bi < batchsize; bi++){
+            for(int i=0; i<config_vec[bi].Ntg; i++){
+                if(cudap1ptrs_vec[bi].Ms[i] != 0){ // skip empty tile groups
+                    hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N,
+                               cudap1ptrs_vec[bi].Ms[i], cudap1ptrs_vec[bi].Ks[i],
+                               &alpha, (const DeviceType*)cudap1ptrs_vec[bi].Abp[i], cudap1ptrs_vec[bi].Ms[i],
+                               (const DeviceType*)cudap1ptrs_vec[bi].xbp[i], 1, &beta,
+                               cudap1ptrs_vec[bi].ybp[i], 1);
+                }
+            }
+        }
+        hipDeviceSynchronize(); // wait for all streams before returning
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::Phase1Transpose(){
+        // Phase 1 (transpose): gemv with OP_T over the transpose pointer set;
+        // note dimensions are swapped (Ks x Ms) and lda is Ks.
+        hipDeviceSynchronize();
+        for(int bi=0; bi < batchsize; bi++){
+            for(int i=0; i<config_vec[bi].Ntg; i++){
+                if(cudap1transptrs_vec[bi].Ms[i] != 0){ // skip empty tile groups
+                    hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T,
+                               cudap1transptrs_vec[bi].Ks[i], cudap1transptrs_vec[bi].Ms[i],
+                               &alpha, (const DeviceType*)cudap1transptrs_vec[bi].Abp[i],
+                               cudap1transptrs_vec[bi].Ks[i],
+                               (const DeviceType*)cudap1transptrs_vec[bi].xbp[i], 1, &beta,
+                               cudap1transptrs_vec[bi].ybp[i], 1);
+                }
+            }
+        }
+        hipDeviceSynchronize();
+    }
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::Phase1Prepare() {
+        // Allocate host-side pointer tables and device buffers for phase 1, carve
+        // the flat A/x/y allocations into per-tile sub-pointers, and upload A and x.
+        for(int bi=0; bi<batchsize; bi++){
+            int curbatch = cudap1ptrs_vec[bi].Ms.size(); // number of tile groups for this problem
+            GethipHostMemory(&cudap1ptrs_vec[bi].Abp, curbatch);
+            GethipHostMemory(&cudap1ptrs_vec[bi].xbp, curbatch);
+            GethipHostMemory(&cudap1ptrs_vec[bi].ybp, curbatch);
+        }
+        for(int bi=0; bi<batchsize; bi++){
+            GetDeviceMemory(&cudap1ptrs_vec[bi].A, cudap1ptrs_vec[bi].Acnt);
+            GetDeviceMemory(&cudap1ptrs_vec[bi].x, cudap1ptrs_vec[bi].Xcnt);
+            GetDeviceMemory(&cudap1ptrs_vec[bi].y, cudap1ptrs_vec[bi].Ycnt);
+            cudap1ptrs_vec[bi].Abp[0] = cudap1ptrs_vec[bi].A; // first tile starts at the base of each flat buffer
+            cudap1ptrs_vec[bi].xbp[0] = cudap1ptrs_vec[bi].x;
+            cudap1ptrs_vec[bi].ybp[0] = cudap1ptrs_vec[bi].y;
+        }
+        for(int bi=0; bi<batchsize; bi++){
+            auto AvMs = cudap1ptrs_vec[bi].Ms;
+            auto AvNs = cudap1ptrs_vec[bi].Ns;
+            auto AvKs = cudap1ptrs_vec[bi].Ks;
+            for(int i=1; i<config_vec[bi].Ntg; i++){ // prefix-sum the tile sizes into sub-pointers
+                size_t AvMK = AvMs[i-1] * AvKs[i-1];
+                size_t AvKN = AvKs[i-1] * AvNs[i-1];
+                size_t AvMN = AvMs[i-1] * AvNs[i-1];
+                cudap1ptrs_vec[bi].Abp[i] =cudap1ptrs_vec[bi].Abp[i-1] + AvMK;
+                cudap1ptrs_vec[bi].xbp[i] = cudap1ptrs_vec[bi].xbp[i-1] + AvKN;
+                cudap1ptrs_vec[bi].ybp[i] = cudap1ptrs_vec[bi].ybp[i-1] + AvMN;
+            }
+            // load phase1 A,x to GPU
+            CopyDataB2HD((HostType*)cudap1ptrs_vec[bi].A, cpuinstvec[bi]->p1ptrs.A, cpuinstvec[bi]->p1ptrs.Acnt);
+            CopyDataB2HD((HostType*)cudap1ptrs_vec[bi].x, cpuinstvec[bi]->p1ptrs.x, cpuinstvec[bi]->p1ptrs.Xcnt);
+        }
+
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::Phase1PrepareTranspose() {
+        // Transpose phase 1 aliases existing buffers: A reuses phase-3's U bases,
+        // x reuses phase-1's x; only y gets a fresh device allocation.
+        for(int bi=0; bi < batchsize; bi++){
+            int curbatch = cudap1ptrs_vec[bi].Ms.size(); // NOTE(review): sized from the NON-transpose pointer set — confirm it always matches cudap1transptrs_vec[bi].Ms.size()
+            GethipHostMemory(&cudap1transptrs_vec[bi].Abp, curbatch);
+            GethipHostMemory(&cudap1transptrs_vec[bi].xbp, curbatch);
+            GethipHostMemory(&cudap1transptrs_vec[bi].ybp, curbatch);
+        }
+        for(int bi=0; bi<batchsize; bi++){
+            cudap1transptrs_vec[bi].A = cudap3ptrs_vec[bi].A;
+            cudap1transptrs_vec[bi].x = cudap1ptrs_vec[bi].x;
+            GetDeviceMemory(&cudap1transptrs_vec[bi].y, cudap1transptrs_vec[bi].Ycnt);
+            cudap1transptrs_vec[bi].Abp[0] = cudap3ptrs_vec[bi].A; // use phase 3, U bases
+            cudap1transptrs_vec[bi].xbp[0] = cudap1ptrs_vec[bi].x; // use phase 1, x
+            cudap1transptrs_vec[bi].ybp[0] = cudap1transptrs_vec[bi].y; // create a new buffer
+        }
+        for(int bi=0; bi<batchsize; bi++){
+            for(int i=1; i<cudap1transptrs_vec[bi].Ms.size(); i++){ // carve sub-pointers from the aliased/new buffers
+                size_t AvMK = cudap1transptrs_vec[bi].Ms[i-1] * cudap1transptrs_vec[bi].Ks[i-1];
+                size_t AvKN = cudap1transptrs_vec[bi].Ks[i-1] * cudap1transptrs_vec[bi].Ns[i-1];
+                size_t AvMN = cudap1transptrs_vec[bi].Ms[i-1] * cudap1transptrs_vec[bi].Ns[i-1];
+                cudap1transptrs_vec[bi].Abp[i] = cudap1transptrs_vec[bi].Abp[i-1] + AvMK;
+                cudap1transptrs_vec[bi].xbp[i] = cudap1transptrs_vec[bi].xbp[i-1] + AvKN;
+                cudap1transptrs_vec[bi].ybp[i] = cudap1transptrs_vec[bi].ybp[i-1] + AvMN;
+            }
+        }
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::Phase2(){
+        // Phase 2: permute phase-1 output y into phase-3 input x using the
+        // precomputed device-side mapping, one kernel launch per problem.
+        hipDeviceSynchronize();
+        for(int bi=0; bi<batchsize; bi++){
+            phase2_nosplit<DeviceType>(cudap1ptrs_vec[bi].y, d_phase2mapping_vec[bi],
+                                       cudap3ptrs_vec[bi].x,
+                                       config_vec[bi].granksum, streamptr[bi % stream_size]);
+        }
+        hipDeviceSynchronize();
+    }
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::Phase2Transpose(){
+        // Same as Phase2 but over the transpose buffers and transpose mapping.
+        hipDeviceSynchronize();
+        for(int bi=0; bi<batchsize; bi++){
+            phase2_nosplit<DeviceType>(cudap1transptrs_vec[bi].y, d_phase2mapping_transpose_vec[bi],
+                                       cudap3transptrs_vec[bi].x, config_vec[bi].granksum,
+                                       streamptr[bi%stream_size]);
+        }
+        hipDeviceSynchronize();
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::Phase2Prepare(){
+        // Upload each problem's phase-2 permutation mapping to the device.
+        d_phase2mapping_vec = new size_t*[batchsize]; // raw array of device pointers — TODO confirm matching cleanup
+        for(int bi=0; bi<batchsize; bi++){
+            GetDeviceMemory(&d_phase2mapping_vec[bi], cpuinstvec[bi]->h_phase2mapping.size());
+            CopyDataB2HD(d_phase2mapping_vec[bi], cpuinstvec[bi]->h_phase2mapping.data(),
+                         cpuinstvec[bi]->h_phase2mapping.size());
+        }
+    }
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::Phase2PrepareTranspose(){
+        // Transpose counterpart: upload the transpose permutation mappings.
+        d_phase2mapping_transpose_vec = new size_t*[batchsize];
+        for(int bi=0; bi<batchsize; bi++){
+            GetDeviceMemory(&d_phase2mapping_transpose_vec[bi], cpuinstvec[bi]->h_phase2mappingTranspose.size());
+            CopyDataB2HD(d_phase2mapping_transpose_vec[bi], cpuinstvec[bi]->h_phase2mappingTranspose.data(),
+                         cpuinstvec[bi]->h_phase2mappingTranspose.size());
+        }
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::Phase3(){
+        // Phase 3 (non-transpose): one gemv per tile row (Mtg groups).
+        // NOTE(review): unlike Phase1 there is no Ms[i] != 0 guard here — confirm
+        // zero-size groups cannot occur in phase 3, or that hipblasgemv tolerates them.
+        hipDeviceSynchronize();
+        for(int bi=0; bi<batchsize; bi++){
+            for(int i=0; i<config_vec[bi].Mtg; i++){
+                hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N,
+                           cudap3ptrs_vec[bi].Ms[i], cudap3ptrs_vec[bi].Ks[i],
+                           &alpha, (const DeviceType*)cudap3ptrs_vec[bi].Abp[i], cudap3ptrs_vec[bi].Ms[i],
+                           (const DeviceType*)cudap3ptrs_vec[bi].xbp[i], 1, &beta,cudap3ptrs_vec[bi].ybp[i], 1);
+            }
+        }
+        hipDeviceSynchronize();
+    }
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::Phase3Transpose(){
+        // Phase 3 (transpose): OP_T with swapped dimensions, as in Phase1Transpose.
+        hipDeviceSynchronize();
+        for(int bi=0; bi<batchsize; bi++){
+            for(int i=0; i<config_vec[bi].Mtg; i++){
+                hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T,
+                           cudap3transptrs_vec[bi].Ks[i], cudap3transptrs_vec[bi].Ms[i],
+                           &alpha, (const DeviceType*)cudap3transptrs_vec[bi].Abp[i],
+                           cudap3transptrs_vec[bi].Ks[i],
+                           (const DeviceType*)cudap3transptrs_vec[bi].xbp[i], 1,
+                           &beta,cudap3transptrs_vec[bi].ybp[i], 1);
+            }
+        }
+        hipDeviceSynchronize();
+    }
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::Phase3Prepare() {
+        // Mirror of Phase1Prepare for phase 3: allocate pointer tables and flat
+        // device buffers, carve sub-pointers, and upload A (x comes from phase 2).
+        for(int bi=0; bi<batchsize; bi++){
+            int curbatch = cudap3ptrs_vec[bi].Ms.size();
+            GethipHostMemory(&cudap3ptrs_vec[bi].Abp, curbatch);
+            GethipHostMemory(&cudap3ptrs_vec[bi].xbp, curbatch);
+            GethipHostMemory(&cudap3ptrs_vec[bi].ybp, curbatch);
+        }
+        for(int bi=0; bi<batchsize; bi++){
+            GetDeviceMemory(&cudap3ptrs_vec[bi].A, cudap3ptrs_vec[bi].Acnt);
+            GetDeviceMemory(&cudap3ptrs_vec[bi].x, cudap3ptrs_vec[bi].Xcnt);
+            GetDeviceMemory(&cudap3ptrs_vec[bi].y, cudap3ptrs_vec[bi].Ycnt);
+            cudap3ptrs_vec[bi].Abp[0] = cudap3ptrs_vec[bi].A;
+            cudap3ptrs_vec[bi].xbp[0] = cudap3ptrs_vec[bi].x;
+            cudap3ptrs_vec[bi].ybp[0] = cudap3ptrs_vec[bi].y;
+        }
+        for(int bi=0; bi<batchsize; bi++){
+            auto AuMs = cudap3ptrs_vec[bi].Ms;
+            auto AuNs = cudap3ptrs_vec[bi].Ns;
+            auto AuKs = cudap3ptrs_vec[bi].Ks;
+            for(int i=1; i<cpuinstvec[bi]->config.Mtg; i++){ // prefix-sum tile sizes into sub-pointers
+                size_t AuMK = AuMs[i-1] * AuKs[i-1];
+                size_t AuKN = AuKs[i-1] * AuNs[i-1];
+                size_t AuMN = AuMs[i-1] * AuNs[i-1];
+                cudap3ptrs_vec[bi].Abp[i] = cudap3ptrs_vec[bi].Abp[i-1] + AuMK;
+                cudap3ptrs_vec[bi].xbp[i] = cudap3ptrs_vec[bi].xbp[i-1] + AuKN;
+                cudap3ptrs_vec[bi].ybp[i] = cudap3ptrs_vec[bi].ybp[i-1] + AuMN;
+            }
+            // load phase 3 A to GPU
+            CopyDataB2HD((HostType*)cudap3ptrs_vec[bi].A, cpuinstvec[bi]->p3ptrs.A, cudap3ptrs_vec[bi].Acnt);
+        }
+    }
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::Phase3PrepareTranspose() {
+        // Transpose phase 3 aliases buffers: A reuses phase-1's V bases, x reuses
+        // phase-3's x; only y is newly allocated. No data upload is needed.
+        for(int bi=0; bi<batchsize; bi++){
+            int curbatch = cudap3transptrs_vec[bi].Ms.size();
+            GethipHostMemory(&cudap3transptrs_vec[bi].Abp, curbatch);
+            GethipHostMemory(&cudap3transptrs_vec[bi].xbp, curbatch);
+            GethipHostMemory(&cudap3transptrs_vec[bi].ybp, curbatch);
+        }
+        for(int bi=0; bi<batchsize; bi++){
+            cudap3transptrs_vec[bi].A = cudap1ptrs_vec[bi].A;
+            cudap3transptrs_vec[bi].x = cudap3ptrs_vec[bi].x;
+            GetDeviceMemory(&cudap3transptrs_vec[bi].y, cudap3transptrs_vec[bi].Ycnt);
+            cudap3transptrs_vec[bi].Abp[0] = cudap1ptrs_vec[bi].A; // use phase 1, V bases
+            cudap3transptrs_vec[bi].xbp[0] = cudap3ptrs_vec[bi].x; // use phase 3, x
+            cudap3transptrs_vec[bi].ybp[0] = cudap3transptrs_vec[bi].y; // create a new buffer
+        }
+        for(int bi=0; bi<batchsize; bi++){
+            for(int i=1; i<cudap3transptrs_vec[bi].Ms.size(); i++){ // carve sub-pointers
+                size_t AvMK = cudap3transptrs_vec[bi].Ms[i-1] * cudap3transptrs_vec[bi].Ks[i-1];
+                size_t AvKN = cudap3transptrs_vec[bi].Ks[i-1] * cudap3transptrs_vec[bi].Ns[i-1];
+                size_t AvMN = cudap3transptrs_vec[bi].Ms[i-1] * cudap3transptrs_vec[bi].Ns[i-1];
+                cudap3transptrs_vec[bi].Abp[i] = cudap3transptrs_vec[bi].Abp[i-1] + AvMK;
+                cudap3transptrs_vec[bi].xbp[i] = cudap3transptrs_vec[bi].xbp[i-1] + AvKN;
+                cudap3transptrs_vec[bi].ybp[i] = cudap3transptrs_vec[bi].ybp[i-1] + AvMN;
+            }
+            // no need to copy data.
+        }
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::MVM_MultiGraph(){
+        // Dispatch to the multi-graph MVM variant matching the current mode.
+        if(transpose){
+            MVM_MultiGraphTranspose();
+        }else{
+            MVM_MultiGraphNoTranspose();
+        }
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::MVM_MultiGraphTranspose(){
+        // Transpose MVM with one HIP graph per problem: capture phases 1-3 with
+        // event-based cross-stream synchronization on first call, then replay.
+        auto & graphCreated = transposemultigraph.graphCreated;
+        auto & event_start = transposemultigraph.event_start;
+        auto & events = transposemultigraph.events;
+        auto & graph = transposemultigraph.graph;
+        auto & instance = transposemultigraph.instance;
+        for(int bi=0; bi<batchsize; bi++){
+            if(!graphCreated[bi]){ // capture once per problem, replay afterwards
+                hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal);
+                hipEventRecord(event_start[bi], streamptr[0]);
+                for(int streami=1; streami<stream_size; streami++){
+                    hipStreamWaitEvent(streamptr[streami], event_start[bi],0); // fork: side streams join the capture
+                }
+                // phase 1 transpose
+                for(int i=0; i<config_vec[bi].Ntg; i++){
+                    if(cudap1transptrs_vec[bi].Ms[i] != 0){
+                        hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T,
+                                   cudap1transptrs_vec[bi].Ks[i], cudap1transptrs_vec[bi].Ms[i],
+                                   &alpha, (const DeviceType*)cudap1transptrs_vec[bi].Abp[i],
+                                   cudap1transptrs_vec[bi].Ks[i],
+                                   (const DeviceType*)cudap1transptrs_vec[bi].xbp[i], 1, &beta,
+                                   cudap1transptrs_vec[bi].ybp[i], 1);
+                    }
+                }
+                for(int streami=1; streami < stream_size; streami++){
+                    hipEventRecord(events[bi][streami], streamptr[streami]);
+                }
+                for(int streami=1; streami < stream_size; streami++){
+                    hipStreamWaitEvent(streamptr[0], events[bi][streami],0); // join: stream 0 waits for all phase-1 work
+                }
+                // phase 2 transpose
+                phase2_nosplit<DeviceType>(cudap1transptrs_vec[bi].y,
+                                           d_phase2mapping_transpose_vec[bi],
+                                           cudap3transptrs_vec[bi].x,
+                                           config_vec[bi].granksum, streamptr[0]);
+                hipEventRecord(events[bi][0], streamptr[0]);
+                for(int streami=1; streami < stream_size; streami++){
+                    hipStreamWaitEvent(streamptr[streami], events[bi][0],0); // fork again for phase 3
+                }
+                // phase 3 transpose
+                for(int i=0; i<config_vec[bi].Mtg; i++){
+                    hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T,
+                               cudap3transptrs_vec[bi].Ks[i], cudap3transptrs_vec[bi].Ms[i],
+                               &alpha, (const DeviceType*)cudap3transptrs_vec[bi].Abp[i],
+                               cudap3transptrs_vec[bi].Ks[i],
+                               (const DeviceType*)cudap3transptrs_vec[bi].xbp[i],
+                               1, &beta,cudap3transptrs_vec[bi].ybp[i], 1);
+                }
+                // final merge
+                for(int streami=1; streami < stream_size; streami++){
+                    hipEventRecord(events[bi][stream_size + streami], streamptr[streami]);
+                }
+                for(int streami=1; streami < stream_size; streami++){
+                    hipStreamWaitEvent(streamptr[0], events[bi][stream_size + streami],0);
+                }
+                hipStreamEndCapture(streamptr[0], &graph[bi]);
+                hipGraphInstantiate(&instance[bi], graph[bi], nullptr, nullptr, 0);
+                graphCreated[bi] = true;
+            }
+            hipGraphLaunch(instance[bi], streamptr[0]);
+            hipStreamSynchronize(streamptr[0]); // graphs run sequentially, one sync per problem
+        }
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::MVM_MultiGraphNoTranspose()
+    {
+        // Non-transpose MVM with one HIP graph per problem; same capture/replay and
+        // fork/join event pattern as MVM_MultiGraphTranspose but with OP_N gemvs.
+        auto & graphCreated = multigraph.graphCreated;
+        auto & event_start = multigraph.event_start;
+        auto & events = multigraph.events;
+        auto & graph = multigraph.graph;
+        auto & instance = multigraph.instance;
+        for(int bi=0; bi<batchsize; bi++){
+            if(!graphCreated[bi]){ // capture once per problem, replay afterwards
+                hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal);
+                hipEventRecord(event_start[bi], streamptr[0]);
+                for(int streami=1; streami<stream_size; streami++){
+                    hipStreamWaitEvent(streamptr[streami], event_start[bi],0); // fork
+                }
+                // phase 1
+                for(int i=0; i<config_vec[bi].Ntg; i++){
+                    if(cudap1ptrs_vec[bi].Ms[i] != 0){
+                        hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N,
+                                   cudap1ptrs_vec[bi].Ms[i], cudap1ptrs_vec[bi].Ks[i],
+                                   &alpha, (const DeviceType*)cudap1ptrs_vec[bi].Abp[i], cudap1ptrs_vec[bi].Ms[i],
+                                   (const DeviceType*)cudap1ptrs_vec[bi].xbp[i], 1, &beta,
+                                   cudap1ptrs_vec[bi].ybp[i], 1);
+                    }
+                }
+                for(int streami=1; streami < stream_size; streami++){
+                    hipEventRecord(events[bi][streami], streamptr[streami]);
+                }
+                for(int streami=1; streami < stream_size; streami++){
+                    hipStreamWaitEvent(streamptr[0], events[bi][streami],0); // join before phase 2
+                }
+                // phase 2
+                phase2_nosplit<DeviceType>(cudap1ptrs_vec[bi].y,
+                                           d_phase2mapping_vec[bi], cudap3ptrs_vec[bi].x,
+                                           config_vec[bi].granksum, streamptr[0]);
+                hipEventRecord(events[bi][0], streamptr[0]);
+                for(int streami=1; streami < stream_size; streami++){
+                    hipStreamWaitEvent(streamptr[streami], events[bi][0],0); // fork for phase 3
+                }
+                // phase 3
+                for(int i=0; i<config_vec[bi].Mtg; i++){
+                    hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N,
+                               cudap3ptrs_vec[bi].Ms[i], cudap3ptrs_vec[bi].Ks[i],
+                               &alpha, (const DeviceType*)cudap3ptrs_vec[bi].Abp[i], cudap3ptrs_vec[bi].Ms[i],
+                               (const DeviceType*)cudap3ptrs_vec[bi].xbp[i], 1, &beta,cudap3ptrs_vec[bi].ybp[i], 1);
+                }
+                // final merge
+                for(int streami=1; streami < stream_size; streami++){
+                    hipEventRecord(events[bi][stream_size + streami], streamptr[streami]);
+                }
+                for(int streami=1; streami < stream_size; streami++){
+                    hipStreamWaitEvent(streamptr[0], events[bi][stream_size + streami],0);
+                }
+                hipStreamEndCapture(streamptr[0], &graph[bi]);
+                hipGraphInstantiate(&instance[bi],
+                                     graph[bi], nullptr, nullptr, 0);
+                graphCreated[bi] = true;
+            }
+            hipGraphLaunch(instance[bi], streamptr[0]);
+            hipStreamSynchronize(streamptr[0]);
+        }
+    }
+
+
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::MVM_SingleGraph()
+    {
+        // Dispatch to the single-graph MVM variant matching the current mode.
+        // FIX(review): previously called the MVM_MultiGraph* variants (copy-paste),
+        // leaving MVM_SingleGraphTranspose/NoTranspose below unreachable.
+        if(transpose){
+            MVM_SingleGraphTranspose();
+        }else{
+            MVM_SingleGraphNoTranspose();
+        }
+    }
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::MVM_SingleGraphTranspose()
+    {
+        // Transpose MVM captured as ONE HIP graph covering the whole batch:
+        // phases are batch-wide, separated by the graph object's sync helpers.
+        auto & graphCreated = transposesinglegraph.graphCreated;
+        auto & event_start = transposesinglegraph.event_start;
+        auto & events = transposesinglegraph.events;
+        auto & graph = transposesinglegraph.graph;
+        auto & instance = transposesinglegraph.instance;
+        if(!graphCreated){ // capture once, replay on later calls
+            hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal);
+            transposesinglegraph.syncotherstreams(event_start, streamptr, stream_size);
+            for(int bi=0; bi<batchsize; bi++){
+                // phase 1 transpose
+                for(int i=0; i<config_vec[bi].Ntg; i++){
+                    if(cudap1transptrs_vec[bi].Ms[i] != 0){
+                        hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T,
+                                   cudap1transptrs_vec[bi].Ks[i], cudap1transptrs_vec[bi].Ms[i],
+                                   &alpha, (const DeviceType*)cudap1transptrs_vec[bi].Abp[i],
+                                   cudap1transptrs_vec[bi].Ks[i],
+                                   (const DeviceType*)cudap1transptrs_vec[bi].xbp[i], 1, &beta,
+                                   cudap1transptrs_vec[bi].ybp[i], 1);
+                    }
+                }
+            }
+            // phase 1 synchronization
+            transposesinglegraph.syncallstreams(events, streamptr, stream_size);
+            // phase 2 transpose
+            for(int bi=0; bi<batchsize; bi++){
+                phase2_nosplit<DeviceType>(cudap1transptrs_vec[bi].y,
+                                           d_phase2mapping_transpose_vec[bi],
+                                           cudap3transptrs_vec[bi].x,
+                                           config_vec[bi].granksum, streamptr[0]); // NOTE(review): always stream 0 here, but the non-transpose variant uses streamptr[bi%stream_size] — confirm which is intended
+            }
+            // phase 2 synchronization
+            transposesinglegraph.syncallstreams(events+1*stream_size, streamptr, stream_size);
+            for(int bi=0; bi<batchsize; bi++){
+                // phase 3 transpose
+                for(int i=0; i<config_vec[bi].Mtg; i++){
+                    hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T,
+                               cudap3transptrs_vec[bi].Ks[i], cudap3transptrs_vec[bi].Ms[i],
+                               &alpha, (const DeviceType*)cudap3transptrs_vec[bi].Abp[i],
+                               cudap3transptrs_vec[bi].Ks[i],
+                               (const DeviceType*)cudap3transptrs_vec[bi].xbp[i],
+                               1, &beta,cudap3transptrs_vec[bi].ybp[i], 1);
+                }
+            }
+            // final merge
+            transposesinglegraph.syncstream0(events+2*stream_size, streamptr, stream_size);
+            hipStreamEndCapture(streamptr[0], &graph);
+            hipGraphInstantiate(&instance, graph,
+                                 nullptr, nullptr, 0);
+            graphCreated = true;
+        }
+        hipGraphLaunch(instance, streamptr[0]);
+        hipStreamSynchronize(streamptr[0]);
+    }
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::MVM_SingleGraphNoTranspose()
+    {
+        // Non-transpose MVM captured as ONE HIP graph covering the whole batch.
+        auto & graphCreated = singlegraph.graphCreated;
+        auto & event_start = singlegraph.event_start;
+        auto & events = singlegraph.events;
+        auto & graph = singlegraph.graph;
+        auto & instance = singlegraph.instance;
+
+        if(!graphCreated){ // capture once, replay on later calls
+            hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal);
+            singlegraph.syncotherstreams(event_start, streamptr, stream_size);
+            // phase 1
+            for(int bi=0; bi<batchsize; bi++){
+                for(int i=0; i<config_vec[bi].Ntg; i++){
+                    if(cudap1ptrs_vec[bi].Ms[i] != 0){
+                        hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N,
+                                   cudap1ptrs_vec[bi].Ms[i], cudap1ptrs_vec[bi].Ks[i],
+                                   &alpha, (const DeviceType*)cudap1ptrs_vec[bi].Abp[i], cudap1ptrs_vec[bi].Ms[i],
+                                   (const DeviceType*)cudap1ptrs_vec[bi].xbp[i], 1, &beta,
+                                   cudap1ptrs_vec[bi].ybp[i], 1);
+                    }
+                }
+            }
+            // phase 1 synchronization
+            singlegraph.syncallstreams(events, streamptr, stream_size);
+            // phase 2
+            for(int bi=0; bi<batchsize; bi++){
+                phase2_nosplit<DeviceType>(cudap1ptrs_vec[bi].y,
+                                           d_phase2mapping_vec[bi], cudap3ptrs_vec[bi].x,
+                                           config_vec[bi].granksum, streamptr[bi%stream_size]); // spreads the permutation kernels across streams
+            }
+            // phase 2 synchronization
+            singlegraph.syncallstreams(events+1*stream_size, streamptr, stream_size);
+            // phase 3
+            for(int bi=0; bi<batchsize; bi++){
+                for(int i=0; i<config_vec[bi].Mtg; i++){
+                    hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N,
+                               cudap3ptrs_vec[bi].Ms[i], cudap3ptrs_vec[bi].Ks[i],
+                               &alpha, (const DeviceType*)cudap3ptrs_vec[bi].Abp[i], cudap3ptrs_vec[bi].Ms[i],
+                               (const DeviceType*)cudap3ptrs_vec[bi].xbp[i], 1,
+                               &beta,cudap3ptrs_vec[bi].ybp[i], 1);
+                }
+            }
+            // final merge
+            singlegraph.syncstream0(events+2*stream_size, streamptr, stream_size);
+            hipStreamEndCapture(streamptr[0], &graph);
+            hipGraphInstantiate(&instance, graph,
+                                 nullptr, nullptr, 0);
+            graphCreated = true;
+        }
+        hipGraphLaunch(instance, streamptr[0]);
+        hipStreamSynchronize(streamptr[0]);
+    }
+
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType,DeviceType>::TryConjugateResults() {
+//        if(!conjugate) return;
+        if(transpose){
+            for(int bi=0; bi<config_vec.size(); bi++){
+                if(conjugate) ConjugateDriver<DeviceType>(cudap3transptrs_vec[bi].y,config_vec[bi].originN, streamptr[0]);
+                CopyDataB2HD(cpuinstvec[bi]->p3transptrs.y, (HostType*)cudap3transptrs_vec[bi].y,cpuinstvec[bi]->config.originM);
+            }
+        }else{
+            for(int bi=0; bi<config_vec.size(); bi++){
+                if(conjugate) ConjugateDriver<DeviceType>(cudap3ptrs_vec[bi].y,config_vec[bi].originM,streamptr[0]);
+                CopyDataB2HD(cpuinstvec[bi]->p3ptrs.y, (HostType*)cudap3ptrs_vec[bi].y,cpuinstvec[bi]->config.originM);
+            }
+        }
+    }
+
+
+    template<typename HostType, typename DeviceType>
+    void BatchTlrmvmhip<HostType, DeviceType>::CopyBackResults()
+    {
+        size_t offset = 0, origin = 0;
+        for(int bi=0; bi<batchsize; bi++){
+            // use cpu pointers to send output
+            if(transpose){
+                CopyDataB2HD(cpuinstvec[bi]->p1transptrs.y, (HostType*)cudap1transptrs_vec[bi].y, cpuinstvec[bi]->config.granksum);
+                CopyDataB2HD(cpuinstvec[bi]->p3transptrs.x, (HostType*)cudap3transptrs_vec[bi].x, cpuinstvec[bi]->config.granksum);
+                CopyDataB2HD(cpuinstvec[bi]->p3transptrs.y, (HostType*)cudap3transptrs_vec[bi].y, cpuinstvec[bi]->config.originM);
+                origin = cpuinstvec[bi]->config.originM;
+//                memcpy(cpuinstvec[bi]->p3transptrs.y, &alpha, sizeof(HostType));
+                memcpy(finalresults.data() + offset,cpuinstvec[bi]->p3transptrs.y, sizeof(HostType) * origin);
+                offset += cpuinstvec[bi]->config.originM;
+            }else{
+                CopyDataB2HD(cpuinstvec[bi]->p1ptrs.y, (HostType*)cudap1ptrs_vec[bi].y, cpuinstvec[bi]->config.granksum);
+                CopyDataB2HD(cpuinstvec[bi]->p3ptrs.x, (HostType*)cudap3ptrs_vec[bi].x, cpuinstvec[bi]->config.granksum);
+                CopyDataB2HD(cpuinstvec[bi]->p3ptrs.y, (HostType*)cudap3ptrs_vec[bi].y, cpuinstvec[bi]->config.originM);
+                origin = cpuinstvec[bi]->config.originM;
+                memcpy(finalresults.data() + offset,cpuinstvec[bi]->p3ptrs.y, sizeof(HostType) * origin);
+                offset += cpuinstvec[bi]->config.originM;
+            }
+            cpuinstvec[bi]->CopyToFinalresults();
+        }
+    }
+
    template<typename HostType, typename DeviceType>
    void BatchTlrmvmhip<HostType,DeviceType>::MemoryFree(){
        // Per-problem teardown: free the CPU-side buffers, then the pinned host
        // pointer tables (Abp/xbp/ybp) and device buffers of each phase set.
        for(int bi=0; bi<batchsize; bi++){
            cpuinstvec[bi]->MemoryFree();
            // phase 1: owns its A, x and y device buffers
            FreehipHostMemory(cudap1ptrs_vec[bi].Abp);
            FreehipHostMemory(cudap1ptrs_vec[bi].xbp);
            FreehipHostMemory(cudap1ptrs_vec[bi].ybp);
            FreeDeviceMemory(cudap1ptrs_vec[bi].A);
            FreeDeviceMemory(cudap1ptrs_vec[bi].x);
            FreeDeviceMemory(cudap1ptrs_vec[bi].y);

            // phase 3: owns its A, x and y device buffers
            FreehipHostMemory(cudap3ptrs_vec[bi].Abp);
            FreehipHostMemory(cudap3ptrs_vec[bi].xbp);
            FreehipHostMemory(cudap3ptrs_vec[bi].ybp);
            FreeDeviceMemory(cudap3ptrs_vec[bi].A);
            FreeDeviceMemory(cudap3ptrs_vec[bi].x);
            FreeDeviceMemory(cudap3ptrs_vec[bi].y);

            // transpose sets: only y is freed here — A/x presumably alias the
            // non-transpose buffers (as in Tlrmvmhip::AllocatePhase1BufferTranspose);
            // TODO(review) confirm the batch allocation follows the same pattern.
            FreehipHostMemory(cudap1transptrs_vec[bi].Abp);
            FreehipHostMemory(cudap1transptrs_vec[bi].xbp);
            FreehipHostMemory(cudap1transptrs_vec[bi].ybp);
            FreeDeviceMemory(cudap1transptrs_vec[bi].y);

            FreehipHostMemory(cudap3transptrs_vec[bi].Abp);
            FreehipHostMemory(cudap3transptrs_vec[bi].xbp);
            FreehipHostMemory(cudap3transptrs_vec[bi].ybp);
            FreeDeviceMemory(cudap3transptrs_vec[bi].y);
        }
    }
+
+    template class BatchTlrmvmhip<float, float>;
+    template class BatchTlrmvmhip<double, double>;
+    template class BatchTlrmvmhip<complex<float>, hipComplex>;
+    template class BatchTlrmvmhip<complex<double>, hipDoubleComplex>;
+
+
+}
\ No newline at end of file
diff --git a/src/tlrmvm/hip/BatchTlrmvmhip.hpp b/src/tlrmvm/hip/BatchTlrmvmhip.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..bb2bd281cbcb68bf3c1b5bc57143557008df2a67
--- /dev/null
+++ b/src/tlrmvm/hip/BatchTlrmvmhip.hpp
@@ -0,0 +1,84 @@
+#pragma once
+
+#include <vector>
+using std::vector;
+
+
+#include "../cpu/TlrmvmCPU.hpp"
+#include "Tlrmvmhip.hpp"
+#include "tlrmvmhiputil.hpp"
+#include <memory>
+
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+namespace hiptlrmvm
+{
    // Batched TLR-MVM driver for HIP: runs many independent TLR-MVM problems
    // (one TlrmvmConfig each) on one device, sharing a pool of streams,
    // hipBLAS handles and HIP graphs.
    template<typename HostType, typename DeviceType>
    class BatchTlrmvmhip
    {
    public:
        explicit BatchTlrmvmhip(vector<TlrmvmConfig> tlrmvmconfigvec);
        BatchTlrmvmhip();
        // stream / handle pool lifecycle
        void StreamInit(int streamsize);
        void StreamDestroy();
        // device + host buffer lifecycle
        void MemoryInit();
        void MemoryFree();
        // the three TLR-MVM phases, plus their transpose variants and
        // one-time preparation steps
        void Phase1();
        void Phase1Transpose();
        void Phase1Prepare();
        void Phase1PrepareTranspose();
        void Phase2();
        void Phase2Transpose();
        void Phase2Prepare();
        void Phase2PrepareTranspose();
        void Phase3();
        void Phase3Transpose();
        void Phase3Prepare();
        void Phase3PrepareTranspose();
        // full MVM drivers using HIP graph capture: either one graph for the
        // whole batch (Single) or one graph per problem (Multi)
        void MVM_SingleGraph();
        void MVM_SingleGraphTranspose();
        void MVM_SingleGraphNoTranspose();
        void MVM_MultiGraph();
        void MVM_MultiGraphTranspose();
        void MVM_MultiGraphNoTranspose();

        // mode selection and input/output transfer
        void SetTransposeConjugate(bool transpose, bool conjugate);
        void setX(HostType * xvector, size_t xlength);
        void TryConjugateXvec();
        void TryConjugateResults();
        void CopyBackResults();

        bool transpose;   // apply the transpose MVM path
        bool conjugate;   // conjugate x / results (complex types)

        int batchsize;    // number of independent problems
        // cpu instance
        vector<TlrmvmConfig> config_vec;
        vector<std::shared_ptr<TlrmvmCPU<HostType>>> cpuinstvec;

        // GPU resources
        hipStream_t * streamptr;            // stream pool, size stream_size
        hipblasHandle_t * cublashandleptr;  // one hipBLAS handle per stream
        int stream_size;

        MultiGraph multigraph;
        MultiGraph transposemultigraph;
        SingleGraph singlegraph;
        SingleGraph transposesinglegraph;

        DeviceType alpha;  // gemv scaling factors
        DeviceType beta;
        // gpu pointers (one entry per problem)
        vector<HIPPhasePointers<DeviceType>> cudap1ptrs_vec;
        vector<HIPPhasePointers<DeviceType>> cudap1transptrs_vec;
        size_t * *d_phase2mapping_vec;            // device phase-2 index maps
        size_t * *d_phase2mapping_transpose_vec;
        vector<HIPPhasePointers<DeviceType>> cudap3ptrs_vec;
        vector<HIPPhasePointers<DeviceType>> cudap3transptrs_vec;
        vector<HostType> finalresults;  // packed outputs of all problems
    };
+}
+
diff --git a/src/tlrmvm/hip/Tlrmvmhip.cpp b/src/tlrmvm/hip/Tlrmvmhip.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..12320c1a449863c87de1a4126a0b1fb5a1a2ab45
--- /dev/null
+++ b/src/tlrmvm/hip/Tlrmvmhip.cpp
@@ -0,0 +1,607 @@
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hipblas.h>
+
+#include "../../common/Common.hpp"
+#include "../../common/AppUtil.hpp"
+#include "../cpu/TlrmvmCPU.hpp"
+#include "Tlrmvmhip.hpp"
+#include "hipkernel.cuh"
+
+namespace hiptlrmvm
+{
+
    // Default-construct with no buffers attached.
    template<typename T>
    HIPPhasePointers<T>::HIPPhasePointers(){}

    // Explicit instantiations for every supported device element type.
    template struct HIPPhasePointers<float>;
    template struct HIPPhasePointers<double>;
    template struct HIPPhasePointers<hipComplex>;
    template struct HIPPhasePointers<hipDoubleComplex>;

    // Copy only the size metadata (element counts, per-tile dimensions) from
    // the CPU-side phase pointers; the actual device/host pointers are
    // allocated and wired up separately.
    template<typename SrcType, typename DestType>
    void PhasePointersCopyNonPointers(HIPPhasePointers<DestType> &dest, const PhasePointers<SrcType> &src){
        dest.Acnt = src.Acnt;
        dest.Xcnt = src.Xcnt;
        dest.Ycnt = src.Ycnt;
        dest.Ms = src.Ms;
        dest.Ks = src.Ks;
        dest.Ns = src.Ns;
    }

    template void PhasePointersCopyNonPointers<float,float>(HIPPhasePointers<float> &,
                                                            const PhasePointers<float>&);
    template void PhasePointersCopyNonPointers<double,double>(HIPPhasePointers<double> &,
                                                              const PhasePointers<double>&);
    template void PhasePointersCopyNonPointers<complex<float>,hipComplex>
            (HIPPhasePointers<hipComplex> &, const PhasePointers<complex<float>>&);
    template void PhasePointersCopyNonPointers<complex<double>,hipDoubleComplex>
            (HIPPhasePointers<hipDoubleComplex> &, const PhasePointers<complex<double>>&);
+
    // Default constructor: leaves the instance unconfigured.
    template<typename HostType, typename DeviceType>
    Tlrmvmhip<HostType, DeviceType>::Tlrmvmhip() {}

    // Construct from a config: starts in plain (non-transpose, non-conjugate)
    // mode and delegates all host-memory work to an owned TlrmvmCPU instance.
    template<typename HostType, typename DeviceType>
    Tlrmvmhip<HostType, DeviceType>::Tlrmvmhip(TlrmvmConfig tlrmvmconfig)
            :config(tlrmvmconfig)
    {
        transpose = false;
        conjugate = false;
        init_alpha_beta(alpha, beta);
        tlrmvmcpu = std::make_shared<TlrmvmCPU<HostType>>(tlrmvmconfig);
    }
+
+    template<typename HostType, typename DeviceType>
+    void Tlrmvmhip<HostType, DeviceType>::UpdateConfig(TlrmvmConfig &tlrmvmconfig)
+    {
+//        transpose = false;
+//        conjugate = false;
+//        init_alpha_beta(alpha, beta);
+//        tlrmvmcpu->UpdateConfig(tlrmvmconfig);
+        cout << "UpdateConfig not implemented." << endl;
+        exit(0);
+    }
+
    // Forward the input vector to the CPU instance; conjugation of x (when
    // enabled) is also applied there.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::setX(HostType * xvector, size_t xlength){
        tlrmvmcpu->setX(xvector, xlength);
        tlrmvmcpu->TryConjugateXvec();
    }
+
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::TryConjugateXvec() {
        // no transpose logic
        tlrmvmcpu->TryConjugateXvec();
        // upload the (possibly conjugated) x into the phase-1 device buffer
        CopyDataB2HD((HostType*)this->cudap1ptrs.x, tlrmvmcpu->p1ptrs.x, tlrmvmcpu->xmat.Shape()[0]);
    }
    // If conjugation is enabled, conjugate the output vector in place on the
    // device and refresh the host-side copy; no-op otherwise.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::TryConjugateResults() {
        if(!conjugate) return;
        if(transpose){
            ConjugateDriver<DeviceType>(cudap3transptrs.y, config.originN, streamptr[0]);
            CopyDataB2HD(tlrmvmcpu->p3transptrs.y, (HostType*)cudap3transptrs.y, tlrmvmcpu->config.originM);
        }else{
            ConjugateDriver<DeviceType>(cudap3ptrs.y, config.originM, streamptr[0]);
            CopyDataB2HD(tlrmvmcpu->p3ptrs.y, (HostType*)cudap3ptrs.y, tlrmvmcpu->config.originM);
        }
    }
+
+    template<typename HostType, typename DeviceType>
+    void Tlrmvmhip<HostType,DeviceType>::StreamInit(int streamsize){
+        this->stream_size = streamsize;
+        streamptr = new hipStream_t[streamsize];
+        cublashandleptr = new hipblasHandle_t[streamsize];
+        for(int i=0; i<streamsize; i++)
+            hipStreamCreateWithFlags(&streamptr[i], hipStreamNonBlocking);
+        for(int i=0; i<streamsize; i++)
+            hipblasCreate(&cublashandleptr[i]);
+        for(int i=0; i<streamsize; i++)
+            hipblasSetStream(cublashandleptr[i], streamptr[i]);
+        HIPCHECK(hipEventCreate(&event_start));
+        HIPCHECK(hipEventCreate(&event_phase2finish));
+        events = new hipEvent_t[2*streamsize];
+        for(int i=0; i<2*streamsize; i++) HIPCHECK(hipEventCreate(&events[i]));
+        // graph
+        graphCreated = false;
+    }
+
    // Tear down everything created by StreamInit; the captured graph is only
    // destroyed if a graph was actually instantiated.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::StreamDestroy(){
        for(int i=0; i<stream_size; i++) hipblasDestroy(cublashandleptr[i]);
        for(int i=0; i<stream_size; i++) hipStreamDestroy(streamptr[i]);
        delete[] cublashandleptr;
        delete[] streamptr;
        HIPCHECK(hipEventDestroy(event_start));
        HIPCHECK(hipEventDestroy(event_phase2finish));
        for(int i=0; i<2*stream_size; i++) HIPCHECK(hipEventDestroy(events[i]));
        delete[] events;
        // graph
        if(graphCreated){
            hipGraphExecDestroy(instance);
            hipGraphDestroy(graph);
        }
    }
+
    // Allocate and populate all host/device buffers: first mirror the CPU
    // instance's size metadata, then build the normal phase 1/2/3 buffers,
    // then the transpose variants (which alias the normal A/x buffers).
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::MemoryInit(){
        tlrmvmcpu->MemoryInit();
        PhasePointersCopyNonPointers<HostType, DeviceType>(cudap1ptrs, tlrmvmcpu->p1ptrs);
        PhasePointersCopyNonPointers<HostType, DeviceType>(cudap3ptrs, tlrmvmcpu->p3ptrs);
        PhasePointersCopyNonPointers<HostType, DeviceType>(cudap1transptrs, tlrmvmcpu->p1transptrs);
        PhasePointersCopyNonPointers<HostType, DeviceType>(cudap3transptrs, tlrmvmcpu->p3transptrs);
        Phase1GetMembuffer();
        AllocatePhase1Buffer();
        Phase1CopyData();
        Phase2Prepare();
        Phase3GetMembuffer();
        AllocatePhase3Buffer();
        Phase3CopyData();
        // transpose
        Phase1GetMembufferTranspose();
        AllocatePhase1BufferTranspose();
        Phase1CopyDataTranspose();
        Phase2PrepareTranspose();
        Phase3GetMembufferTranspose();
        AllocatePhase3BufferTranspose();
        Phase3CopyDataTranspose();
    }
+
    // Release CPU buffers, pinned host pointer tables and device buffers.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::MemoryFree(){
        tlrmvmcpu->MemoryFree();
        FreehipHostMemory(cudap1ptrs.Abp);
        FreehipHostMemory(cudap1ptrs.xbp);
        FreehipHostMemory(cudap1ptrs.ybp);
        FreeDeviceMemory(cudap1ptrs.A);
        FreeDeviceMemory(cudap1ptrs.x);
        FreeDeviceMemory(cudap1ptrs.y);

        FreehipHostMemory(cudap3ptrs.Abp);
        FreehipHostMemory(cudap3ptrs.xbp);
        FreehipHostMemory(cudap3ptrs.ybp);
        FreeDeviceMemory(cudap3ptrs.A);
        FreeDeviceMemory(cudap3ptrs.x);
        FreeDeviceMemory(cudap3ptrs.y);

        // transpose sets alias the normal A/x buffers (see the
        // AllocatePhase*BufferTranspose functions), so only y is freed here
        FreehipHostMemory(cudap1transptrs.Abp);
        FreehipHostMemory(cudap1transptrs.xbp);
        FreehipHostMemory(cudap1transptrs.ybp);
        FreeDeviceMemory(cudap1transptrs.y);

        FreehipHostMemory(cudap3transptrs.Abp);
        FreehipHostMemory(cudap3transptrs.xbp);
        FreehipHostMemory(cudap3transptrs.ybp);
        FreeDeviceMemory(cudap3transptrs.y);
    }
+
    // Phase 1: one gemv per non-empty tile column, round-robined across the
    // hipBLAS handles (and thus streams); device-wide sync before and after.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::Phase1(){
        hipDeviceSynchronize();
        for(int i=0; i<config.Ntg; i++){
            if(cudap1ptrs.Ms[i] != 0){
                hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N,
                           cudap1ptrs.Ms[i], cudap1ptrs.Ks[i],
                           &alpha, (const DeviceType*)cudap1ptrs.Abp[i], cudap1ptrs.Ms[i],
                           (const DeviceType*)cudap1ptrs.xbp[i], 1, &beta,
                           cudap1ptrs.ybp[i], 1);
            }
        }
        hipDeviceSynchronize();
    }
+
    // Transposed phase 1: same loop as Phase1 but with OP_T and the (m, n)
    // dimensions / leading dimension taken from the transpose pointer set.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::Phase1Transpose(){
        hipDeviceSynchronize();
        for(int i=0; i<config.Ntg; i++){
            if(cudap1transptrs.Ms[i] != 0){
                hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T,
                           cudap1transptrs.Ks[i], cudap1transptrs.Ms[i],
                           &alpha, (const DeviceType*)cudap1transptrs.Abp[i],
                           cudap1transptrs.Ks[i],
                           (const DeviceType*)cudap1transptrs.xbp[i], 1, &beta,
                           cudap1transptrs.ybp[i], 1);
            }
        }
        hipDeviceSynchronize();
    }
+
+    template<typename HostType, typename DeviceType>
+    void Tlrmvmhip<HostType,DeviceType>::Phase1GetMembuffer(){
+        int batchsize = cudap1ptrs.Ms.size();
+        GethipHostMemory(&cudap1ptrs.Abp, batchsize);
+        GethipHostMemory(&cudap1ptrs.xbp, batchsize);
+        GethipHostMemory(&cudap1ptrs.ybp, batchsize);
+    }
+    template<typename HostType, typename DeviceType>
+    void Tlrmvmhip<HostType,DeviceType>::Phase1GetMembufferTranspose()
+    {
+        int batchsize = cudap1ptrs.Ms.size();
+        GethipHostMemory(&cudap1transptrs.Abp, batchsize);
+        GethipHostMemory(&cudap1transptrs.xbp, batchsize);
+        GethipHostMemory(&cudap1transptrs.ybp, batchsize);
+    }
+
    // Allocate the flat device buffers for phase 1 and anchor the per-tile
    // pointer tables at their starts (offsets are filled in by Phase1CopyData).
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::AllocatePhase1Buffer(){
        GetDeviceMemory(&cudap1ptrs.A, cudap1ptrs.Acnt);
        GetDeviceMemory(&cudap1ptrs.x, cudap1ptrs.Xcnt);
        GetDeviceMemory(&cudap1ptrs.y, cudap1ptrs.Ycnt);
        cudap1ptrs.Abp[0] = cudap1ptrs.A;
        cudap1ptrs.xbp[0] = cudap1ptrs.x;
        cudap1ptrs.ybp[0] = cudap1ptrs.y;
    }
    // Transposed phase 1 reuses existing device data: A aliases the phase-3
    // (U) bases and x aliases the phase-1 input; only y is a fresh buffer.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::AllocatePhase1BufferTranspose(){
        cudap1transptrs.A = cudap3ptrs.A;
        cudap1transptrs.x = cudap1ptrs.x;
        GetDeviceMemory(&cudap1transptrs.y, cudap1transptrs.Ycnt);
        cudap1transptrs.Abp[0] = cudap3ptrs.A; // use phase 3, U bases
        cudap1transptrs.xbp[0] = cudap1ptrs.x; // use phase 1, x
        cudap1transptrs.ybp[0] = cudap1transptrs.y; // create a new buffer
    }
+
    // Compute per-tile device pointers as running offsets from the previous
    // tile's sizes, then upload the phase-1 A and x host data to the GPU.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::Phase1CopyData(){
        auto AvMs = cudap1ptrs.Ms;
        auto AvNs = cudap1ptrs.Ns;
        auto AvKs = cudap1ptrs.Ks;
        for(int i=1; i<config.Ntg; i++){
            size_t AvMK = AvMs[i-1] * AvKs[i-1];
            size_t AvKN = AvKs[i-1] * AvNs[i-1];
            size_t AvMN = AvMs[i-1] * AvNs[i-1];
            cudap1ptrs.Abp[i] =cudap1ptrs.Abp[i-1] + AvMK;
            cudap1ptrs.xbp[i] = cudap1ptrs.xbp[i-1] + AvKN;
            cudap1ptrs.ybp[i] = cudap1ptrs.ybp[i-1] + AvMN;
        }
        // load phase1 A,x to GPU
        CopyDataB2HD((HostType*)cudap1ptrs.A, tlrmvmcpu->p1ptrs.A, tlrmvmcpu->p1ptrs.Acnt);
        CopyDataB2HD((HostType*)cudap1ptrs.x, tlrmvmcpu->p1ptrs.x, tlrmvmcpu->p1ptrs.Xcnt);
    }
    // Same running-offset pointer setup for the transposed set; the device
    // data itself was already uploaded by the non-transpose path.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::Phase1CopyDataTranspose(){
        for(int i=1; i<cudap1transptrs.Ms.size(); i++){
            size_t AvMK = cudap1transptrs.Ms[i-1] * cudap1transptrs.Ks[i-1];
            size_t AvKN = cudap1transptrs.Ks[i-1] * cudap1transptrs.Ns[i-1];
            size_t AvMN = cudap1transptrs.Ms[i-1] * cudap1transptrs.Ns[i-1];
            cudap1transptrs.Abp[i] = cudap1transptrs.Abp[i-1] + AvMK;
            cudap1transptrs.xbp[i] = cudap1transptrs.xbp[i-1] + AvKN;
            cudap1transptrs.ybp[i] = cudap1transptrs.ybp[i-1] + AvMN;
        }
        // no need to copy data. data is copied in normal node.
    }
+
    // Phase 2: scatter phase-1 results into the phase-3 input layout via the
    // precomputed index map, on stream 0, with full device syncs around it.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::Phase2(){
        hipDeviceSynchronize();
        phase2_nosplit<DeviceType>(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x,
                                   config.granksum, streamptr[0]);
        hipDeviceSynchronize();
    }
    // Phase 2 for the transpose path: identical shape, using the transposed
    // buffers and the transpose index map.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::Phase2Transpose(){
        hipDeviceSynchronize();
        phase2_nosplit<DeviceType>(cudap1transptrs.y, d_phase2mapping_transpose,
                                   cudap3transptrs.x, config.granksum, streamptr[0]);
        hipDeviceSynchronize();
    }
+
    // Upload the CPU-computed phase-2 index map to device memory.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::Phase2Prepare(){
        GetDeviceMemory(&d_phase2mapping, tlrmvmcpu->h_phase2mapping.size());
        CopyDataB2HD(d_phase2mapping, tlrmvmcpu->h_phase2mapping.data(),tlrmvmcpu->h_phase2mapping.size());
    }
    // Upload the transpose-path phase-2 index map to device memory.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::Phase2PrepareTranspose(){
        GetDeviceMemory(&d_phase2mapping_transpose, tlrmvmcpu->h_phase2mappingTranspose.size());
        CopyDataB2HD(d_phase2mapping_transpose, tlrmvmcpu->h_phase2mappingTranspose.data(),
                     tlrmvmcpu->h_phase2mappingTranspose.size());
    }
+
    // Phase 3: one gemv per tile row, round-robined across handles; note no
    // Ms[i] != 0 guard here, unlike Phase1.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::Phase3(){
        hipDeviceSynchronize();
        for(int i=0; i<config.Mtg; i++){
            hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N,
                       cudap3ptrs.Ms[i], cudap3ptrs.Ks[i],
                       &alpha, (const DeviceType*)cudap3ptrs.Abp[i], cudap3ptrs.Ms[i],
                       (const DeviceType*)cudap3ptrs.xbp[i], 1, &beta,cudap3ptrs.ybp[i], 1);
        }
        hipDeviceSynchronize();
    }
    // Transposed phase 3: OP_T with dimensions/leading dimension from the
    // transpose pointer set.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::Phase3Transpose(){
        hipDeviceSynchronize();
        for(int i=0; i<config.Mtg; i++){
            hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T,
                       cudap3transptrs.Ks[i], cudap3transptrs.Ms[i],
                       &alpha, (const DeviceType*)cudap3transptrs.Abp[i],
                       cudap3transptrs.Ks[i],
                       (const DeviceType*)cudap3transptrs.xbp[i], 1, &beta,cudap3transptrs.ybp[i], 1);
        }
        hipDeviceSynchronize();
    }
+
+    template<typename HostType, typename DeviceType>
+    void Tlrmvmhip<HostType,DeviceType>::Phase3GetMembuffer(){
+        int batchsize = cudap3ptrs.Ms.size();
+        GethipHostMemory(&cudap3ptrs.Abp, batchsize);
+        GethipHostMemory(&cudap3ptrs.xbp, batchsize);
+        GethipHostMemory(&cudap3ptrs.ybp, batchsize);
+    }
    // Pinned-host pointer tables for the transposed phase 3.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::Phase3GetMembufferTranspose(){
        int batchsize = cudap3transptrs.Ms.size();
        GethipHostMemory(&cudap3transptrs.Abp, batchsize);
        GethipHostMemory(&cudap3transptrs.xbp, batchsize);
        GethipHostMemory(&cudap3transptrs.ybp, batchsize);
    }
+
    // Allocate the flat device buffers for phase 3 and anchor the per-tile
    // pointer tables at their starts (offsets filled in by Phase3CopyData).
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::AllocatePhase3Buffer(){
        GetDeviceMemory(&cudap3ptrs.A, cudap3ptrs.Acnt);
        GetDeviceMemory(&cudap3ptrs.x, cudap3ptrs.Xcnt);
        GetDeviceMemory(&cudap3ptrs.y, cudap3ptrs.Ycnt);
        cudap3ptrs.Abp[0] = cudap3ptrs.A;
        cudap3ptrs.xbp[0] = cudap3ptrs.x;
        cudap3ptrs.ybp[0] = cudap3ptrs.y;
    }
    // Transposed phase 3 reuses existing device data: A aliases the phase-1
    // (V) bases and x aliases the phase-3 input; only y is a fresh buffer.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::AllocatePhase3BufferTranspose(){
        cudap3transptrs.A = cudap1ptrs.A;
        cudap3transptrs.x = cudap3ptrs.x;
        GetDeviceMemory(&cudap3transptrs.y, cudap3transptrs.Ycnt);
        cudap3transptrs.Abp[0] = cudap1ptrs.A; // use phase 1, V bases
        cudap3transptrs.xbp[0] = cudap3ptrs.x; // use phase 3, x
        cudap3transptrs.ybp[0] = cudap3transptrs.y; // create a new buffer
    }
+
    // Compute per-tile device pointers as running offsets, then upload the
    // phase-3 A data (x is produced on-device by phase 2).
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::Phase3CopyData(){
        auto AuMs = cudap3ptrs.Ms;
        auto AuNs = cudap3ptrs.Ns;
        auto AuKs = cudap3ptrs.Ks;
        for(int i=1; i<tlrmvmcpu->config.Mtg; i++){
            size_t AuMK = AuMs[i-1] * AuKs[i-1];
            size_t AuKN = AuKs[i-1] * AuNs[i-1];
            size_t AuMN = AuMs[i-1] * AuNs[i-1];
            cudap3ptrs.Abp[i] = cudap3ptrs.Abp[i-1] + AuMK;
            cudap3ptrs.xbp[i] = cudap3ptrs.xbp[i-1] + AuKN;
            cudap3ptrs.ybp[i] = cudap3ptrs.ybp[i-1] + AuMN;
        }
        // load phase 3 A to GPU
        CopyDataB2HD((HostType*)cudap3ptrs.A, tlrmvmcpu->p3ptrs.A, cudap3ptrs.Acnt);
    }
    // Running-offset pointer setup for the transposed phase 3; data itself is
    // shared with the non-transpose buffers, so nothing is uploaded.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::Phase3CopyDataTranspose(){
        for(int i=1; i<cudap3transptrs.Ms.size(); i++){
            size_t AvMK = cudap3transptrs.Ms[i-1] * cudap3transptrs.Ks[i-1];
            size_t AvKN = cudap3transptrs.Ks[i-1] * cudap3transptrs.Ns[i-1];
            size_t AvMN = cudap3transptrs.Ms[i-1] * cudap3transptrs.Ns[i-1];
            cudap3transptrs.Abp[i] = cudap3transptrs.Abp[i-1] + AvMK;
            cudap3transptrs.xbp[i] = cudap3transptrs.xbp[i-1] + AvKN;
            cudap3transptrs.ybp[i] = cudap3transptrs.ybp[i-1] + AvMN;
        }
        // no need to copy data.
    }
+
    // Full transposed MVM recorded once into a HIP graph (stream capture on
    // stream 0, fanned out to the other streams via events), then replayed on
    // every call. Event/stream ordering here is load-bearing — do not reorder.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType, DeviceType>::MVMGraphTranspose() {
        if(!graphCreated){
            hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal);
            hipEventRecord(event_start, streamptr[0]);
            // fork: all worker streams wait on stream 0's start event
            for(int streami=1; streami<stream_size; streami++){
                hipStreamWaitEvent(streamptr[streami], event_start,0);
            }
            // phase 1 transpose
            for(int i=0; i<config.Ntg; i++){
                if(cudap1transptrs.Ms[i] != 0){
                    hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T,
                               cudap1transptrs.Ks[i], cudap1transptrs.Ms[i],
                               &alpha, (const DeviceType*)cudap1transptrs.Abp[i],
                               cudap1transptrs.Ks[i],
                               (const DeviceType*)cudap1transptrs.xbp[i], 1, &beta,
                               cudap1transptrs.ybp[i], 1);
                }
            }
            // join: stream 0 waits for every worker stream before phase 2
            for(int streami=1; streami < stream_size; streami++){
                hipEventRecord(events[streami], streamptr[streami]);
            }
            for(int streami=1; streami < stream_size; streami++){
                hipStreamWaitEvent(streamptr[0], events[streami],0);
            }
            // phase 2 transpose (scatter runs on stream 0 only)
            phase2_nosplit<DeviceType>(cudap1transptrs.y, d_phase2mapping_transpose,
                                       cudap3transptrs.x, config.granksum, streamptr[0]);
            // fork again for phase 3
            hipEventRecord(events[0], streamptr[0]);
            for(int streami=1; streami < stream_size; streami++){
                hipStreamWaitEvent(streamptr[streami], events[0],0);
            }
            // phase 3 transpose
            for(int i=0; i<config.Mtg; i++){
                hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T,
                           cudap3transptrs.Ks[i], cudap3transptrs.Ms[i],
                           &alpha, (const DeviceType*)cudap3transptrs.Abp[i],
                           cudap3transptrs.Ks[i],
                           (const DeviceType*)cudap3transptrs.xbp[i], 1, &beta,cudap3transptrs.ybp[i], 1);
            }
            // final merge back onto stream 0
            for(int streami=1; streami < stream_size; streami++){
                hipEventRecord(events[stream_size + streami], streamptr[streami]);
            }
            for(int streami=1; streami < stream_size; streami++){
                hipStreamWaitEvent(streamptr[0], events[stream_size + streami],0);
            }
            hipStreamEndCapture(streamptr[0], &graph);
            hipGraphInstantiate(&instance, graph, nullptr, nullptr, 0);
            graphCreated = true;
        }
        hipGraphLaunch(instance, streamptr[0]);
        hipStreamSynchronize(streamptr[0]);
    }
+
    // Non-transposed MVM recorded once into a HIP graph and replayed on every
    // call. Same fork/join event pattern as MVMGraphTranspose, but phases 1
    // and 3 use gemm with Ns[i] columns rather than gemv.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType, DeviceType>::MVMGraphNoTranspose() {
        if(!graphCreated){
            hipStreamBeginCapture(streamptr[0],hipStreamCaptureModeGlobal);
            hipEventRecord(event_start, streamptr[0]);
            // fork: worker streams wait on stream 0's start event
            for(int streami=1; streami<stream_size; streami++){
                hipStreamWaitEvent(streamptr[streami], event_start,0);
            }
            // phase 1 (gemm form; the earlier gemv form was removed as dead code)
            for(int i=0; i<config.Ntg; i++){
                if(cudap1ptrs.Ms[i] != 0){
                    hipblasgemm(cublashandleptr[i%stream_size],HIPBLAS_OP_N, HIPBLAS_OP_N,
                                cudap1ptrs.Ms[i], cudap1ptrs.Ns[i], cudap1ptrs.Ks[i],
                                &alpha, (const DeviceType*)cudap1ptrs.Abp[i], cudap1ptrs.Ms[i],
                                (const DeviceType*)cudap1ptrs.xbp[i], cudap1ptrs.Ks[i],
                                &beta, cudap1ptrs.ybp[i], cudap1ptrs.Ms[i]);
                }
            }
            // join on stream 0 before the phase-2 scatter
            for(int streami=1; streami < stream_size; streami++){
                hipEventRecord(events[streami], streamptr[streami]);
            }
            for(int streami=1; streami < stream_size; streami++){
                hipStreamWaitEvent(streamptr[0], events[streami],0);
            }
            // phase 2 (stream 0 only)
            phase2_nosplit<DeviceType>(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x,
                                       config.granksum, streamptr[0]);
            // fork again for phase 3
            hipEventRecord(events[0], streamptr[0]);
            for(int streami=1; streami < stream_size; streami++){
                hipStreamWaitEvent(streamptr[streami], events[0],0);
            }
            // phase 3 (gemm form)
            for(int i=0; i<config.Mtg; i++){
                hipblasgemm(cublashandleptr[i%stream_size],HIPBLAS_OP_N, HIPBLAS_OP_N,
                            cudap3ptrs.Ms[i], cudap3ptrs.Ns[i], cudap3ptrs.Ks[i],
                            &alpha, (const DeviceType*)cudap3ptrs.Abp[i], cudap3ptrs.Ms[i],
                            (const DeviceType*)cudap3ptrs.xbp[i], cudap3ptrs.Ks[i],
                            &beta,cudap3ptrs.ybp[i], cudap3ptrs.Ms[i]);
            }
            // final merge back onto stream 0
            for(int streami=1; streami < stream_size; streami++){
                hipEventRecord(events[stream_size + streami], streamptr[streami]);
            }
            for(int streami=1; streami < stream_size; streami++){
                hipStreamWaitEvent(streamptr[0], events[stream_size + streami],0);
            }
            hipStreamEndCapture(streamptr[0], &graph);
            hipGraphInstantiate(&instance, graph, nullptr, nullptr, 0);
            graphCreated = true;
        }
        hipGraphLaunch(instance, streamptr[0]);
        hipStreamSynchronize(streamptr[0]);
    }
+
+    template<typename HostType, typename DeviceType>
+    void Tlrmvmhip<HostType,DeviceType>::MVMGraph(){
+        if(transpose){
+            MVMGraphTranspose();
+        }else{
+            MVMGraphNoTranspose();
+        }
+    }
+
    // Non-graph transposed MVM: phases 1-3 executed eagerly, separated by
    // full device synchronizations instead of per-stream events.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::MVMTranspose()
    {
        hipDeviceSynchronize();
        for(int i=0; i<config.Ntg; i++){
            if(cudap1transptrs.Ms[i] != 0){
                hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T,
                           cudap1transptrs.Ks[i], cudap1transptrs.Ms[i],
                           &alpha, (const DeviceType*)cudap1transptrs.Abp[i],
                           cudap1transptrs.Ks[i],
                           (const DeviceType*)cudap1transptrs.xbp[i], 1, &beta,
                           cudap1transptrs.ybp[i], 1);
            }
        }
        hipDeviceSynchronize();
        phase2_nosplit<DeviceType>(cudap1transptrs.y, d_phase2mapping_transpose,
                                   cudap3transptrs.x, config.granksum, streamptr[0]);
        hipDeviceSynchronize();
        for(int i=0; i<config.Mtg; i++){
            hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_T,
                       cudap3transptrs.Ks[i], cudap3transptrs.Ms[i],
                       &alpha, (const DeviceType*)cudap3transptrs.Abp[i],
                       cudap3transptrs.Ks[i],
                       (const DeviceType*)cudap3transptrs.xbp[i], 1, &beta,cudap3transptrs.ybp[i], 1);
        }
        hipDeviceSynchronize();
    }
    // Non-graph MVM: phases 1-3 executed eagerly with full device syncs
    // between phases.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType,DeviceType>::MVMNoTranspose()
    {
        hipDeviceSynchronize();
        for(int i=0; i<config.Ntg; i++){
            if(cudap1ptrs.Ms[i] != 0){
                hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N,
                           cudap1ptrs.Ms[i], cudap1ptrs.Ks[i],
                           &alpha, (const DeviceType*)cudap1ptrs.Abp[i], cudap1ptrs.Ms[i],
                           (const DeviceType*)cudap1ptrs.xbp[i], 1, &beta,
                           cudap1ptrs.ybp[i], 1);
            }
        }
        hipDeviceSynchronize();
        phase2_nosplit<DeviceType>(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x,
                                   config.granksum, streamptr[0]);
        hipDeviceSynchronize();
        for(int i=0; i<config.Mtg; i++){
            hipblasgemv(cublashandleptr[i%stream_size], HIPBLAS_OP_N,
                       cudap3ptrs.Ms[i], cudap3ptrs.Ks[i],
                       &alpha, (const DeviceType*)cudap3ptrs.Abp[i], cudap3ptrs.Ms[i],
                       (const DeviceType*)cudap3ptrs.xbp[i], 1, &beta,cudap3ptrs.ybp[i], 1);
        }
        hipDeviceSynchronize();
    }
+
    // Copy device results (intermediate y/x and the final y) back into the
    // CPU instance's buffers, then let it assemble its final output.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType, DeviceType>::CopyBackResults()
    {
        // use cpu pointers to send output
        if(transpose){
            CopyDataB2HD(tlrmvmcpu->p1transptrs.y, (HostType*)cudap1transptrs.y, tlrmvmcpu->config.granksum);
            CopyDataB2HD(tlrmvmcpu->p3transptrs.x, (HostType*)cudap3transptrs.x, tlrmvmcpu->config.granksum);
            CopyDataB2HD(tlrmvmcpu->p3transptrs.y, (HostType*)cudap3transptrs.y, tlrmvmcpu->config.originM);
        }else{
            CopyDataB2HD(tlrmvmcpu->p1ptrs.y, (HostType*)cudap1ptrs.y, tlrmvmcpu->config.granksum);
            CopyDataB2HD(tlrmvmcpu->p3ptrs.x, (HostType*)cudap3ptrs.x, tlrmvmcpu->config.granksum);
            CopyDataB2HD(tlrmvmcpu->p3ptrs.y, (HostType*)cudap3ptrs.y, tlrmvmcpu->config.originM);
        }
        tlrmvmcpu->CopyToFinalresults();
    }
+
+    template<typename HostType, typename DeviceType>
+    void Tlrmvmhip<HostType, DeviceType>::MVM() {
+        if(transpose){
+            MVMTranspose();
+        }else{
+            MVMNoTranspose();
+        }
+    }
+
    // Record the transpose/conjugate mode on both the GPU wrapper and the
    // underlying CPU instance so later calls take the matching path.
    template<typename HostType, typename DeviceType>
    void Tlrmvmhip<HostType, DeviceType>::SetTransposeConjugate(bool transpose, bool conjugate) {
        this->transpose = transpose;
        this->conjugate = conjugate;
        tlrmvmcpu->SetTransposeConjugate(transpose, conjugate);
    }
+
+
+
+    template class Tlrmvmhip<float, float>;
+    template class Tlrmvmhip<double, double>;
+    template class Tlrmvmhip<complex<float>, hipComplex>;
+    template class Tlrmvmhip<complex<double>, hipDoubleComplex>;
+
+} // namespace cudatlrmvm
+
diff --git a/src/tlrmvm/hip/Tlrmvmhip.hpp b/src/tlrmvm/hip/Tlrmvmhip.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..631a0dfeecb7de1a1e09d9bfccfe93eccdf0ca87
--- /dev/null
+++ b/src/tlrmvm/hip/Tlrmvmhip.hpp
@@ -0,0 +1,114 @@
+#pragma once
+
+#include "../../common/Common.hpp"
+#include "../cpu/TlrmvmCPU.hpp"
+#include "../../common/hip/Util.hpp"
+#include <cassert>
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hipblas.h>
+#include <memory>
+
+namespace hiptlrmvm
+{
+    template<typename T>
+    struct HIPPhasePointers{
+        HIPPhasePointers();
+        size_t Acnt;
+        size_t Xcnt;
+        size_t Ycnt;
+        vector<size_t> Ms;
+        vector<size_t> Ks;
+        vector<size_t> Ns;
+        T *A;
+        T *x;
+        T *y;
+        T **Abp;
+        T **xbp;
+        T **ybp;
+    };
+
+    template<typename SrcType, typename DestType>
+    void PhasePointersCopyNonPointers(HIPPhasePointers<DestType> &dest, const PhasePointers<SrcType> &src);
+
+    // Tlrmvmhip is only responsible for HIP device memory ops.
+    // Any host memory related ops should go to the CPU instance.
+    template<typename HostType, typename DeviceType>
+    class Tlrmvmhip
+    {
+    public:
+        explicit Tlrmvmhip(TlrmvmConfig tlrmvmconfig);
+        Tlrmvmhip();
+        void UpdateConfig(TlrmvmConfig &tlrmvmConfig);
+        void SetTransposeConjugate(bool transpose, bool conjugate);
+        void StreamInit(int streamsize);
+        void StreamDestroy();
+        void MemoryInit();
+        void MemoryFree();
+        void Phase1();
+        void Phase1Transpose();
+        void Phase1GetMembuffer();
+        void AllocatePhase1Buffer();
+        void Phase1CopyData();
+        void Phase1GetMembufferTranspose();
+        void AllocatePhase1BufferTranspose();
+        void Phase1CopyDataTranspose();
+        void Phase2();
+        void Phase2Transpose();
+        void Phase2Prepare();
+        void Phase2PrepareTranspose();
+        void Phase3();
+        void Phase3Transpose();
+        void Phase3GetMembuffer();
+        void AllocatePhase3Buffer();
+        void Phase3CopyData();
+        void Phase3GetMembufferTranspose();
+        void AllocatePhase3BufferTranspose();
+        void Phase3CopyDataTranspose();
+        void MVM();
+        void MVMTranspose();
+        void MVMNoTranspose();
+        void MVMGraph();
+        void MVMGraphTranspose();
+        void MVMGraphNoTranspose();
+        void setX(HostType * xvector, size_t xlength);
+        // separate 2 functions.
+        void TryConjugateXvec();
+        void TryConjugateResults();
+        void CopyBackResults();
+
+        bool transpose;
+        bool conjugate;
+        // cpu instance
+        TlrmvmConfig config;
+        shared_ptr<TlrmvmCPU<HostType>> tlrmvmcpu;
+        // GPU resources
+        hipStream_t * streamptr;
+        hipblasHandle_t * cublashandleptr;
+        int stream_size;
+        hipGraph_t graph;
+        bool graphCreated;
+        hipGraphExec_t instance;
+        hipEvent_t *events;
+        hipEvent_t event_start;
+        hipEvent_t event_phase2finish;
+
+        DeviceType alpha;
+        DeviceType beta;
+
+        // gpu pointers
+        HIPPhasePointers<DeviceType> cudap1ptrs;
+        HIPPhasePointers<DeviceType> cudap1transptrs;
+        size_t *d_phase2mapping;
+        size_t *d_phase2mapping_transpose;
+        HIPPhasePointers<DeviceType> cudap3ptrs;
+        HIPPhasePointers<DeviceType> cudap3transptrs;
+    };
+
+
+
+
+} // namespace hiptlrmvm
+
+
+
diff --git a/src/tlrmvm/hip/TlrmvmhipConstRank.cpp b/src/tlrmvm/hip/TlrmvmhipConstRank.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..379f2ff4909c3f0837f945b525557ae2cfc1e0d1
--- /dev/null
+++ b/src/tlrmvm/hip/TlrmvmhipConstRank.cpp
@@ -0,0 +1,484 @@
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hipblas.h>
+
+#include "../../common/Common.hpp"
+#include "../../common/AppUtil.hpp"
+#include "../cpu/TlrmvmCPU.hpp"
+#include "Tlrmvmhip.hpp"
+#include "TlrmvmhipConstRank.hpp"
+#include "hipkernel.cuh"
+
+namespace hiptlrmvm
+{
+    template<typename HostType, typename DeviceType>
+    TlrmvmhipConstRank<HostType, DeviceType>::TlrmvmhipConstRank() {}
+
+    template<typename HostType, typename DeviceType>
+    TlrmvmhipConstRank<HostType, DeviceType>::TlrmvmhipConstRank(TlrmvmConfig tlrmvmconfig)
+            :config(tlrmvmconfig)
+    {
+        transpose = false;
+        conjugate = false;
+        init_alpha_beta(alpha, beta);
+        tlrmvmcpu = std::make_shared<TlrmvmCPU<HostType>>(tlrmvmconfig);
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType, DeviceType>::UpdateConfig(TlrmvmConfig &tlrmvmconfig)
+    {
+//        transpose = false;
+//        conjugate = false;
+//        init_alpha_beta(alpha, beta);
+//        tlrmvmcpu->UpdateConfig(tlrmvmconfig);
+        cout << "UpdateConfig not implemented." << endl;
+        exit(0);
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::setX(HostType * xvector, size_t xlength){
+        tlrmvmcpu->setX(xvector, xlength);
+        tlrmvmcpu->TryConjugateXvec();
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::TryConjugateXvec() {
+        // no transpose logic
+        tlrmvmcpu->TryConjugateXvec();
+        CopyDataB2HD((HostType*)this->cudap1ptrs.x, tlrmvmcpu->p1ptrs.x, tlrmvmcpu->xmat.Shape()[0]);
+    }
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::TryConjugateResults() {
+        if(!conjugate) return;
+        if(transpose){
+            ConjugateDriver<DeviceType>(cudap3transptrs.y, config.originN, stream);
+            CopyDataB2HD(tlrmvmcpu->p3transptrs.y, (HostType*)cudap3transptrs.y, tlrmvmcpu->config.originM);
+        }else{
+            ConjugateDriver<DeviceType>(cudap3ptrs.y, config.originM, stream);
+            CopyDataB2HD(tlrmvmcpu->p3ptrs.y, (HostType*)cudap3ptrs.y, tlrmvmcpu->config.originM);
+        }
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::StreamInit(int streamsize){
+        hipStreamCreateWithFlags(&stream, hipStreamNonBlocking);
+        hipblasCreate(&cublashandle);
+        hipblasSetStream(cublashandle, stream);
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::StreamDestroy(){
+        hipblasDestroy(cublashandle);
+        hipStreamDestroy(stream);
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::MemoryInit(){
+        tlrmvmcpu->MemoryInit();
+        PhasePointersCopyNonPointers<HostType, DeviceType>(cudap1ptrs, tlrmvmcpu->p1ptrs);
+        PhasePointersCopyNonPointers<HostType, DeviceType>(cudap3ptrs, tlrmvmcpu->p3ptrs);
+        PhasePointersCopyNonPointers<HostType, DeviceType>(cudap1transptrs, tlrmvmcpu->p1transptrs);
+        PhasePointersCopyNonPointers<HostType, DeviceType>(cudap3transptrs, tlrmvmcpu->p3transptrs);
+        Phase1GetMembuffer();
+        AllocatePhase1Buffer();
+        Phase1CopyData();
+        Phase2Prepare();
+        Phase3GetMembuffer();
+        AllocatePhase3Buffer();
+        Phase3CopyData();
+        // transpose
+        Phase1GetMembufferTranspose();
+        AllocatePhase1BufferTranspose();
+        Phase1CopyDataTranspose();
+        Phase2PrepareTranspose();
+        Phase3GetMembufferTranspose();
+        AllocatePhase3BufferTranspose();
+        Phase3CopyDataTranspose();
+
+        // init batch pointers
+        GetDeviceMemory(&d_p1Aptrs, cudap1ptrs.Ms.size());
+        GetDeviceMemory(&d_p1xptrs, cudap1ptrs.Ms.size());
+        GetDeviceMemory(&d_p1yptrs, cudap1ptrs.Ms.size());
+        GetDeviceMemory(&d_p3Aptrs, cudap3ptrs.Ms.size());
+        GetDeviceMemory(&d_p3xptrs, cudap3ptrs.Ms.size());
+        GetDeviceMemory(&d_p3yptrs, cudap3ptrs.Ms.size());
+
+        CopyDataB2HD((HostType**)d_p1Aptrs, (HostType**)cudap1ptrs.Abp, cudap1ptrs.Ms.size());
+        CopyDataB2HD((HostType**)d_p1xptrs, (HostType**)cudap1ptrs.xbp, cudap1ptrs.Ms.size());
+        CopyDataB2HD((HostType**)d_p1yptrs, (HostType**)cudap1ptrs.ybp, cudap1ptrs.Ms.size());
+        CopyDataB2HD((HostType**)d_p3Aptrs, (HostType**)cudap3ptrs.Abp, cudap3ptrs.Ms.size());
+        CopyDataB2HD((HostType**)d_p3xptrs, (HostType**)cudap3ptrs.xbp, cudap3ptrs.Ms.size());
+        CopyDataB2HD((HostType**)d_p3yptrs, (HostType**)cudap3ptrs.ybp, cudap3ptrs.Ms.size());
+
+        GetDeviceMemory(&d_p1transAptrs, cudap1transptrs.Ms.size());
+        GetDeviceMemory(&d_p1transxptrs, cudap1transptrs.Ms.size());
+        GetDeviceMemory(&d_p1transyptrs, cudap1transptrs.Ms.size());
+        GetDeviceMemory(&d_p3transAptrs, cudap3transptrs.Ms.size());
+        GetDeviceMemory(&d_p3transxptrs, cudap3transptrs.Ms.size());
+        GetDeviceMemory(&d_p3transyptrs, cudap3transptrs.Ms.size());
+
+        CopyDataB2HD((HostType**)d_p1transAptrs, (HostType**)cudap1transptrs.Abp, cudap1transptrs.Ms.size());
+        CopyDataB2HD((HostType**)d_p1transxptrs, (HostType**)cudap1transptrs.xbp, cudap1transptrs.Ms.size());
+        CopyDataB2HD((HostType**)d_p1transyptrs, (HostType**)cudap1transptrs.ybp, cudap1transptrs.Ms.size());
+        CopyDataB2HD((HostType**)d_p3transAptrs, (HostType**)cudap3transptrs.Abp, cudap3transptrs.Ms.size());
+        CopyDataB2HD((HostType**)d_p3transxptrs, (HostType**)cudap3transptrs.xbp, cudap3transptrs.Ms.size());
+        CopyDataB2HD((HostType**)d_p3transyptrs, (HostType**)cudap3transptrs.ybp, cudap3transptrs.Ms.size());
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::MemoryFree(){
+        tlrmvmcpu->MemoryFree();
+        FreehipHostMemory(cudap1ptrs.Abp);
+        FreehipHostMemory(cudap1ptrs.xbp);
+        FreehipHostMemory(cudap1ptrs.ybp);
+        FreeDeviceMemory(cudap1ptrs.A);
+        FreeDeviceMemory(cudap1ptrs.x);
+        FreeDeviceMemory(cudap1ptrs.y);
+
+        FreehipHostMemory(cudap3ptrs.Abp);
+        FreehipHostMemory(cudap3ptrs.xbp);
+        FreehipHostMemory(cudap3ptrs.ybp);
+        FreeDeviceMemory(cudap3ptrs.A);
+        FreeDeviceMemory(cudap3ptrs.x);
+        FreeDeviceMemory(cudap3ptrs.y);
+
+        FreehipHostMemory(cudap1transptrs.Abp);
+        FreehipHostMemory(cudap1transptrs.xbp);
+        FreehipHostMemory(cudap1transptrs.ybp);
+        FreeDeviceMemory(cudap1transptrs.y);
+
+        FreehipHostMemory(cudap3transptrs.Abp);
+        FreehipHostMemory(cudap3transptrs.xbp);
+        FreehipHostMemory(cudap3transptrs.ybp);
+        FreeDeviceMemory(cudap3transptrs.y);
+
+        FreeDeviceMemory(d_p1Aptrs);
+        FreeDeviceMemory(d_p1xptrs);
+        FreeDeviceMemory(d_p1yptrs);
+        FreeDeviceMemory(d_p3Aptrs);
+        FreeDeviceMemory(d_p3xptrs);
+        FreeDeviceMemory(d_p3yptrs);
+
+        FreeDeviceMemory(d_p1transAptrs);
+        FreeDeviceMemory(d_p1transxptrs);
+        FreeDeviceMemory(d_p1transyptrs);
+        FreeDeviceMemory(d_p3transAptrs);
+        FreeDeviceMemory(d_p3transxptrs);
+        FreeDeviceMemory(d_p3transyptrs);
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase1(){
+        hipDeviceSynchronize();
+        for(int i=0; i<config.Ntg; i++){
+            if(cudap1ptrs.Ms[i] != 0){
+                hipblasgemv(cublashandle, HIPBLAS_OP_N,
+                            cudap1ptrs.Ms[i], cudap1ptrs.Ks[i],
+                            &alpha, (const DeviceType*)cudap1ptrs.Abp[i], cudap1ptrs.Ms[i],
+                            (const DeviceType*)cudap1ptrs.xbp[i], 1, &beta,
+                            cudap1ptrs.ybp[i], 1);
+            }
+        }
+        hipDeviceSynchronize();
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase1Transpose(){
+        hipDeviceSynchronize();
+        for(int i=0; i<config.Ntg; i++){
+            if(cudap1transptrs.Ms[i] != 0){
+                hipblasgemv(cublashandle, HIPBLAS_OP_T,
+                            cudap1transptrs.Ks[i], cudap1transptrs.Ms[i],
+                            &alpha, (const DeviceType*)cudap1transptrs.Abp[i],
+                            cudap1transptrs.Ks[i],
+                            (const DeviceType*)cudap1transptrs.xbp[i], 1, &beta,
+                            cudap1transptrs.ybp[i], 1);
+            }
+        }
+        hipDeviceSynchronize();
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase1GetMembuffer(){
+        int batchsize = cudap1ptrs.Ms.size();
+        GethipHostMemory(&cudap1ptrs.Abp, batchsize);
+        GethipHostMemory(&cudap1ptrs.xbp, batchsize);
+        GethipHostMemory(&cudap1ptrs.ybp, batchsize);
+    }
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase1GetMembufferTranspose()
+    {
+        int batchsize = cudap1ptrs.Ms.size();
+        GethipHostMemory(&cudap1transptrs.Abp, batchsize);
+        GethipHostMemory(&cudap1transptrs.xbp, batchsize);
+        GethipHostMemory(&cudap1transptrs.ybp, batchsize);
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::AllocatePhase1Buffer(){
+        GetDeviceMemory(&cudap1ptrs.A, cudap1ptrs.Acnt);
+        GetDeviceMemory(&cudap1ptrs.x, cudap1ptrs.Xcnt);
+        GetDeviceMemory(&cudap1ptrs.y, cudap1ptrs.Ycnt);
+        cudap1ptrs.Abp[0] = cudap1ptrs.A;
+        cudap1ptrs.xbp[0] = cudap1ptrs.x;
+        cudap1ptrs.ybp[0] = cudap1ptrs.y;
+    }
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::AllocatePhase1BufferTranspose(){
+        cudap1transptrs.A = cudap3ptrs.A;
+        cudap1transptrs.x = cudap1ptrs.x;
+        GetDeviceMemory(&cudap1transptrs.y, cudap1transptrs.Ycnt);
+        cudap1transptrs.Abp[0] = cudap3ptrs.A; // use phase 3, U bases
+        cudap1transptrs.xbp[0] = cudap1ptrs.x; // use phase 1, x
+        cudap1transptrs.ybp[0] = cudap1transptrs.y; // create a new buffer
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase1CopyData(){
+        auto AvMs = cudap1ptrs.Ms;
+        auto AvNs = cudap1ptrs.Ns;
+        auto AvKs = cudap1ptrs.Ks;
+        for(int i=1; i<config.Ntg; i++){
+            size_t AvMK = AvMs[i-1] * AvKs[i-1];
+            size_t AvKN = AvKs[i-1] * AvNs[i-1];
+            size_t AvMN = AvMs[i-1] * AvNs[i-1];
+            cudap1ptrs.Abp[i] =cudap1ptrs.Abp[i-1] + AvMK;
+            cudap1ptrs.xbp[i] = cudap1ptrs.xbp[i-1] + AvKN;
+            cudap1ptrs.ybp[i] = cudap1ptrs.ybp[i-1] + AvMN;
+        }
+        // load phase1 A,x to GPU
+        CopyDataB2HD((HostType*)cudap1ptrs.A, tlrmvmcpu->p1ptrs.A, tlrmvmcpu->p1ptrs.Acnt);
+        CopyDataB2HD((HostType*)cudap1ptrs.x, tlrmvmcpu->p1ptrs.x, tlrmvmcpu->p1ptrs.Xcnt);
+    }
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase1CopyDataTranspose(){
+        for(int i=1; i<cudap1transptrs.Ms.size(); i++){
+            size_t AvMK = cudap1transptrs.Ms[i-1] * cudap1transptrs.Ks[i-1];
+            size_t AvKN = cudap1transptrs.Ks[i-1] * cudap1transptrs.Ns[i-1];
+            size_t AvMN = cudap1transptrs.Ms[i-1] * cudap1transptrs.Ns[i-1];
+            cudap1transptrs.Abp[i] = cudap1transptrs.Abp[i-1] + AvMK;
+            cudap1transptrs.xbp[i] = cudap1transptrs.xbp[i-1] + AvKN;
+            cudap1transptrs.ybp[i] = cudap1transptrs.ybp[i-1] + AvMN;
+        }
+        // no need to copy data here; it is copied in the normal (non-transpose) mode.
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase2(){
+        hipDeviceSynchronize();
+        phase2_nosplit<DeviceType>(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x,
+                                   config.granksum, stream);
+        hipDeviceSynchronize();
+    }
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase2Transpose(){
+        hipDeviceSynchronize();
+        phase2_nosplit<DeviceType>(cudap1transptrs.y, d_phase2mapping_transpose,
+                                   cudap3transptrs.x, config.granksum, stream);
+        hipDeviceSynchronize();
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase2Prepare(){
+        GetDeviceMemory(&d_phase2mapping, tlrmvmcpu->h_phase2mapping.size());
+        CopyDataB2HD(d_phase2mapping, tlrmvmcpu->h_phase2mapping.data(),tlrmvmcpu->h_phase2mapping.size());
+    }
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase2PrepareTranspose(){
+        GetDeviceMemory(&d_phase2mapping_transpose, tlrmvmcpu->h_phase2mappingTranspose.size());
+        CopyDataB2HD(d_phase2mapping_transpose, tlrmvmcpu->h_phase2mappingTranspose.data(),
+                     tlrmvmcpu->h_phase2mappingTranspose.size());
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase3(){
+        hipDeviceSynchronize();
+        for(int i=0; i<config.Mtg; i++){
+            hipblasgemv(cublashandle, HIPBLAS_OP_N,
+                        cudap3ptrs.Ms[i], cudap3ptrs.Ks[i],
+                        &alpha, (const DeviceType*)cudap3ptrs.Abp[i], cudap3ptrs.Ms[i],
+                        (const DeviceType*)cudap3ptrs.xbp[i], 1, &beta,cudap3ptrs.ybp[i], 1);
+        }
+        hipDeviceSynchronize();
+    }
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase3Transpose(){
+        hipDeviceSynchronize();
+        for(int i=0; i<config.Mtg; i++){
+            hipblasgemv(cublashandle, HIPBLAS_OP_T,
+                        cudap3transptrs.Ks[i], cudap3transptrs.Ms[i],
+                        &alpha, (const DeviceType*)cudap3transptrs.Abp[i],
+                        cudap3transptrs.Ks[i],
+                        (const DeviceType*)cudap3transptrs.xbp[i], 1, &beta,cudap3transptrs.ybp[i], 1);
+        }
+        hipDeviceSynchronize();
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase3GetMembuffer(){
+        int batchsize = cudap3ptrs.Ms.size();
+        GethipHostMemory(&cudap3ptrs.Abp, batchsize);
+        GethipHostMemory(&cudap3ptrs.xbp, batchsize);
+        GethipHostMemory(&cudap3ptrs.ybp, batchsize);
+    }
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase3GetMembufferTranspose(){
+        int batchsize = cudap3transptrs.Ms.size();
+        GethipHostMemory(&cudap3transptrs.Abp, batchsize);
+        GethipHostMemory(&cudap3transptrs.xbp, batchsize);
+        GethipHostMemory(&cudap3transptrs.ybp, batchsize);
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::AllocatePhase3Buffer(){
+        GetDeviceMemory(&cudap3ptrs.A, cudap3ptrs.Acnt);
+        GetDeviceMemory(&cudap3ptrs.x, cudap3ptrs.Xcnt);
+        GetDeviceMemory(&cudap3ptrs.y, cudap3ptrs.Ycnt);
+        cudap3ptrs.Abp[0] = cudap3ptrs.A;
+        cudap3ptrs.xbp[0] = cudap3ptrs.x;
+        cudap3ptrs.ybp[0] = cudap3ptrs.y;
+    }
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::AllocatePhase3BufferTranspose(){
+        cudap3transptrs.A = cudap1ptrs.A;
+        cudap3transptrs.x = cudap3ptrs.x;
+        GetDeviceMemory(&cudap3transptrs.y, cudap3transptrs.Ycnt);
+        cudap3transptrs.Abp[0] = cudap1ptrs.A; // use phase 1, V bases
+        cudap3transptrs.xbp[0] = cudap3ptrs.x; // use phase 3, x
+        cudap3transptrs.ybp[0] = cudap3transptrs.y; // create a new buffer
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase3CopyData(){
+        auto AuMs = cudap3ptrs.Ms;
+        auto AuNs = cudap3ptrs.Ns;
+        auto AuKs = cudap3ptrs.Ks;
+        for(int i=1; i<tlrmvmcpu->config.Mtg; i++){
+            size_t AuMK = AuMs[i-1] * AuKs[i-1];
+            size_t AuKN = AuKs[i-1] * AuNs[i-1];
+            size_t AuMN = AuMs[i-1] * AuNs[i-1];
+            cudap3ptrs.Abp[i] = cudap3ptrs.Abp[i-1] + AuMK;
+            cudap3ptrs.xbp[i] = cudap3ptrs.xbp[i-1] + AuKN;
+            cudap3ptrs.ybp[i] = cudap3ptrs.ybp[i-1] + AuMN;
+        }
+        // load phase 3 A to GPU
+        CopyDataB2HD((HostType*)cudap3ptrs.A, tlrmvmcpu->p3ptrs.A, cudap3ptrs.Acnt);
+    }
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::Phase3CopyDataTranspose(){
+        for(int i=1; i<cudap3transptrs.Ms.size(); i++){
+            size_t AvMK = cudap3transptrs.Ms[i-1] * cudap3transptrs.Ks[i-1];
+            size_t AvKN = cudap3transptrs.Ks[i-1] * cudap3transptrs.Ns[i-1];
+            size_t AvMN = cudap3transptrs.Ms[i-1] * cudap3transptrs.Ns[i-1];
+            cudap3transptrs.Abp[i] = cudap3transptrs.Abp[i-1] + AvMK;
+            cudap3transptrs.xbp[i] = cudap3transptrs.xbp[i-1] + AvKN;
+            cudap3transptrs.ybp[i] = cudap3transptrs.ybp[i-1] + AvMN;
+        }
+        // no need to copy data.
+    }
+
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::MVMTranspose()
+    {
+        hipDeviceSynchronize();
+        for(int i=0; i<config.Ntg; i++){
+            if(cudap1transptrs.Ms[i] != 0){
+                hipblasgemv(cublashandle, HIPBLAS_OP_T,
+                            cudap1transptrs.Ks[i], cudap1transptrs.Ms[i],
+                            &alpha, (const DeviceType*)cudap1transptrs.Abp[i],
+                            cudap1transptrs.Ks[i],
+                            (const DeviceType*)cudap1transptrs.xbp[i], 1, &beta,
+                            cudap1transptrs.ybp[i], 1);
+            }
+        }
+        hipDeviceSynchronize();
+        phase2_nosplit<DeviceType>(cudap1transptrs.y, d_phase2mapping_transpose,
+                                   cudap3transptrs.x, config.granksum, stream);
+        hipDeviceSynchronize();
+        for(int i=0; i<config.Mtg; i++){
+            hipblasgemv(cublashandle, HIPBLAS_OP_T,
+                        cudap3transptrs.Ks[i], cudap3transptrs.Ms[i],
+                        &alpha, (const DeviceType*)cudap3transptrs.Abp[i],
+                        cudap3transptrs.Ks[i],
+                        (const DeviceType*)cudap3transptrs.xbp[i], 1, &beta,cudap3transptrs.ybp[i], 1);
+        }
+        hipDeviceSynchronize();
+    }
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType,DeviceType>::MVMNoTranspose()
+    {
+        hipDeviceSynchronize();
+        hipblasgemmbatched(cublashandle, HIPBLAS_OP_N, HIPBLAS_OP_N,
+                          cudap1ptrs.Ms[0],cudap1ptrs.Ns[0],cudap1ptrs.Ks[0],
+                          &alpha, (const DeviceType**)d_p1Aptrs, cudap1ptrs.Ms[0],
+                          (const DeviceType**)d_p1xptrs, cudap1ptrs.Ks[0],
+                          &beta,d_p1yptrs, cudap1ptrs.Ms[0], cudap1ptrs.Ms.size());
+//        for(int i=0; i<config.Ntg; i++){
+//            if(cudap1ptrs.Ms[i] != 0){
+//                hipblasgemv(cublashandle, HIPBLAS_OP_N,
+//                            cudap1ptrs.Ms[i], cudap1ptrs.Ks[i],
+//                            &alpha, (const DeviceType*)cudap1ptrs.Abp[i], cudap1ptrs.Ms[i],
+//                            (const DeviceType*)cudap1ptrs.xbp[i], 1, &beta,
+//                            cudap1ptrs.ybp[i], 1);
+//            }
+//        }
+        hipDeviceSynchronize();
+        phase2_nosplit<DeviceType>(cudap1ptrs.y, d_phase2mapping, cudap3ptrs.x,
+                                   config.granksum, stream);
+        hipDeviceSynchronize();
+        hipblasgemmbatched(cublashandle, HIPBLAS_OP_N, HIPBLAS_OP_N,
+                          cudap3ptrs.Ms[0],cudap3ptrs.Ns[0],cudap3ptrs.Ks[0],
+                          &alpha, (const DeviceType**)d_p3Aptrs, cudap3ptrs.Ms[0],
+                          (const DeviceType**)d_p3xptrs, cudap3ptrs.Ks[0],
+                          &beta,d_p3yptrs, cudap3ptrs.Ms[0], cudap3ptrs.Ms.size());
+//        for(int i=0; i<config.Mtg; i++){
+//            hipblasgemv(cublashandle, HIPBLAS_OP_N,
+//                        cudap3ptrs.Ms[i], cudap3ptrs.Ks[i],
+//                        &alpha, (const DeviceType*)cudap3ptrs.Abp[i], cudap3ptrs.Ms[i],
+//                        (const DeviceType*)cudap3ptrs.xbp[i], 1, &beta,cudap3ptrs.ybp[i], 1);
+//        }
+        hipDeviceSynchronize();
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType, DeviceType>::CopyBackResults()
+    {
+        // use cpu pointers to send output
+        if(transpose){
+            CopyDataB2HD(tlrmvmcpu->p1transptrs.y, (HostType*)cudap1transptrs.y, tlrmvmcpu->config.granksum);
+            CopyDataB2HD(tlrmvmcpu->p3transptrs.x, (HostType*)cudap3transptrs.x, tlrmvmcpu->config.granksum);
+            CopyDataB2HD(tlrmvmcpu->p3transptrs.y, (HostType*)cudap3transptrs.y, tlrmvmcpu->config.originM);
+        }else{
+            CopyDataB2HD(tlrmvmcpu->p1ptrs.y, (HostType*)cudap1ptrs.y, tlrmvmcpu->config.granksum);
+            CopyDataB2HD(tlrmvmcpu->p3ptrs.x, (HostType*)cudap3ptrs.x, tlrmvmcpu->config.granksum);
+            CopyDataB2HD(tlrmvmcpu->p3ptrs.y, (HostType*)cudap3ptrs.y, tlrmvmcpu->config.originM);
+        }
+        tlrmvmcpu->CopyToFinalresults();
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType, DeviceType>::MVM() {
+        if(transpose){
+            MVMTranspose();
+        }else{
+            MVMNoTranspose();
+        }
+    }
+
+    template<typename HostType, typename DeviceType>
+    void TlrmvmhipConstRank<HostType, DeviceType>::SetTransposeConjugate(bool transpose, bool conjugate) {
+        this->transpose = transpose;
+        this->conjugate = conjugate;
+        tlrmvmcpu->SetTransposeConjugate(transpose, conjugate);
+    }
+
+
+
+    template class TlrmvmhipConstRank<float, float>;
+    template class TlrmvmhipConstRank<double, double>;
+    template class TlrmvmhipConstRank<complex<float>, hipComplex>;
+    template class TlrmvmhipConstRank<complex<double>, hipDoubleComplex>;
+
+} // namespace hiptlrmvm
+
diff --git a/src/tlrmvm/hip/TlrmvmhipConstRank.hpp b/src/tlrmvm/hip/TlrmvmhipConstRank.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..f58b8da5e7d1a083d0c3641630c7f46365056556
--- /dev/null
+++ b/src/tlrmvm/hip/TlrmvmhipConstRank.hpp
@@ -0,0 +1,100 @@
+#pragma once
+
+#include "../../common/Common.hpp"
+#include "../cpu/TlrmvmCPU.hpp"
+#include "../../common/hip/Util.hpp"
+#include "Tlrmvmhip.hpp"
+#include <cassert>
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hipblas.h>
+#include <memory>
+
+namespace hiptlrmvm
+{
+
+    template<typename HostType, typename DeviceType>
+    class TlrmvmhipConstRank
+    {
+    public:
+        explicit TlrmvmhipConstRank(TlrmvmConfig tlrmvmconfig);
+        TlrmvmhipConstRank();
+        void UpdateConfig(TlrmvmConfig &tlrmvmConfig);
+        void SetTransposeConjugate(bool transpose, bool conjugate);
+        void StreamInit(int streamsize);
+        void StreamDestroy();
+        void MemoryInit();
+        void MemoryFree();
+        void Phase1();
+        void Phase1Transpose();
+        void Phase1GetMembuffer();
+        void AllocatePhase1Buffer();
+        void Phase1CopyData();
+        void Phase1GetMembufferTranspose();
+        void AllocatePhase1BufferTranspose();
+        void Phase1CopyDataTranspose();
+        void Phase2();
+        void Phase2Transpose();
+        void Phase2Prepare();
+        void Phase2PrepareTranspose();
+        void Phase3();
+        void Phase3Transpose();
+        void Phase3GetMembuffer();
+        void AllocatePhase3Buffer();
+        void Phase3CopyData();
+        void Phase3GetMembufferTranspose();
+        void AllocatePhase3BufferTranspose();
+        void Phase3CopyDataTranspose();
+        void MVM();
+        void MVMTranspose();
+        void MVMNoTranspose();
+        void MVMGraph();
+        void MVMGraphTranspose();
+        void MVMGraphNoTranspose();
+        void setX(HostType * xvector, size_t xlength);
+        // separate 2 functions.
+        void TryConjugateXvec();
+        void TryConjugateResults();
+        void CopyBackResults();
+
+        bool transpose;
+        bool conjugate;
+        // cpu instance
+        TlrmvmConfig config;
+        shared_ptr<TlrmvmCPU<HostType>> tlrmvmcpu;
+        // GPU resources
+        hipStream_t stream;
+        hipblasHandle_t cublashandle;
+        DeviceType alpha;
+        DeviceType beta;
+
+        // gpu pointers
+        HIPPhasePointers<DeviceType> cudap1ptrs;
+        HIPPhasePointers<DeviceType> cudap1transptrs;
+        size_t *d_phase2mapping;
+        size_t *d_phase2mapping_transpose;
+        HIPPhasePointers<DeviceType> cudap3ptrs;
+        HIPPhasePointers<DeviceType> cudap3transptrs;
+
+        DeviceType **d_p1Aptrs;
+        DeviceType **d_p1xptrs;
+        DeviceType **d_p1yptrs;
+        DeviceType **d_p3Aptrs;
+        DeviceType **d_p3xptrs;
+        DeviceType **d_p3yptrs;
+
+        DeviceType **d_p1transAptrs;
+        DeviceType **d_p1transxptrs;
+        DeviceType **d_p1transyptrs;
+        DeviceType **d_p3transAptrs;
+        DeviceType **d_p3transxptrs;
+        DeviceType **d_p3transyptrs;
+    };
+
+
+
+
+} // namespace hiptlrmvm
+
+
+
diff --git a/src/tlrmvm/hip/hipkernel.cpp b/src/tlrmvm/hip/hipkernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bdd78bb10f0fe6603ffb5736974a74ca9d92c7bc
--- /dev/null
+++ b/src/tlrmvm/hip/hipkernel.cpp
@@ -0,0 +1,74 @@
+//
+// Created by Yuxi Hong on 28/02/2022.
+//
+
+#include "../../common/hip/Util.hpp"
+#include "hipkernel.cuh"
+
+
+namespace hiptlrmvm {
+
+    template<typename T>
+    __global__ void phase2_nosplit_kernel(const T * __restrict__ yv,
+                                          const size_t * __restrict__ phase2mapping, T * __restrict__ yu, size_t len)
+    {
+        size_t thread_x = blockDim.x * blockIdx.x + threadIdx.x;
+        if (thread_x < len){
+            yu[phase2mapping[thread_x]] = yv[thread_x];
+        }
+    }
+
+    template<typename T>
+    void phase2_nosplit(const T *yv, const size_t * phase2mapping, T * yu, size_t len, hipStream_t stream){
+        int dimx = 512;
+        int griddimx = (len+dimx-1) / dimx;
+        phase2_nosplit_kernel<<<griddimx, dimx, 0, stream>>>(yv, phase2mapping, yu, len);
+        HIPCHECK(hipGetLastError());
+    }
+
+    template void phase2_nosplit<float>(const float*, const size_t *, float *, size_t, hipStream_t);
+    template void phase2_nosplit<double>(const double*, const size_t *, double *, size_t, hipStream_t);
+    template void phase2_nosplit<hipDoubleComplex>(const hipDoubleComplex*, const size_t *,
+            hipDoubleComplex *, size_t, hipStream_t);
+    template void phase2_nosplit<hipComplex>(const hipComplex*, const size_t *, hipComplex *, size_t, hipStream_t);
+//    template void phase2_nosplit<cuHalfComplex>(const cuHalfComplex*, const size_t *, cuHalfComplex *, size_t, hipStream_t);
+
+
+    __forceinline__ __device__ float conj(float Invec){
+        return Invec;
+    }
+    __forceinline__ __device__ double conj(double Invec){
+        return Invec;
+    }
+    __forceinline__ __device__ hipComplex conj(hipComplex Invec){
+        return {Invec.x, -Invec.y};
+    }
+    __forceinline__ __device__ hipDoubleComplex conj(hipDoubleComplex Invec){
+        return {Invec.x, -Invec.y};
+    }
+
+    template<typename T>
+    __global__ void ConjugateKernel(T *Invec, size_t length)
+    {
+        size_t thread_x = blockDim.x * blockIdx.x + threadIdx.x;
+        if (thread_x < length){
+            Invec[thread_x] = conj(Invec[thread_x]);
+        }
+    }
+
+    template<typename T>
+    void ConjugateDriver(T *Invec, size_t length, hipStream_t stream){
+        int dimx = 512;
+        int griddimx = (length+dimx-1) / dimx;
+        ConjugateKernel<<<griddimx, dimx, 0, stream>>>(Invec, length);
+        HIPCHECK(hipGetLastError());
+    }
+    template void ConjugateDriver<float>(float *Invec, size_t length, hipStream_t stream);
+    template void ConjugateDriver<double>(double *Invec, size_t length, hipStream_t stream);
+
+    template void ConjugateDriver<hipComplex>(hipComplex *Invec, size_t length, hipStream_t stream);
+    template void ConjugateDriver<hipDoubleComplex>(hipDoubleComplex *Invec, size_t length, hipStream_t stream);
+
+
+} // namespace
+
diff --git a/src/tlrmvm/hip/hipkernel.cuh b/src/tlrmvm/hip/hipkernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..0549467f9edc22b4b935fcdc26688bde18ca75c5
--- /dev/null
+++ b/src/tlrmvm/hip/hipkernel.cuh
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <iostream>
+#include <complex>
+#include <hip/hip_runtime.h>
+#include <hip/hip_fp16.h>
+#include <hip/hip_runtime.h>
+
+
+namespace hiptlrmvm{
+
+    // normal phase 2
+    template<typename T>
+    void phase2_nosplit(const T *yv, const size_t * phase2mapping, T * yu, size_t len, hipStream_t stream);
+
+    // in-place conjugate convert
+    template<typename T>
+    void ConjugateDriver(T *Invec, size_t length, hipStream_t stream);
+
+
+
+} // namespace
diff --git a/src/tlrmvm/hip/tlrmvmhiputil.cpp b/src/tlrmvm/hip/tlrmvmhiputil.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d4c9ca0c049ac5d13b1018b29fb97a31d8380df4
--- /dev/null
+++ b/src/tlrmvm/hip/tlrmvmhiputil.cpp
@@ -0,0 +1,91 @@
+//
+// Created by Yuxi Hong on 08/04/2022.
+//
+
+#include "tlrmvmhiputil.hpp"
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#include <hipblas.h>
+
+namespace hiptlrmvm{
+
+    SingleGraph::SingleGraph() {}
+
+    void SingleGraph::StreamInit(int streamsize) {
+        // single graph creation
+        this->streamsize = streamsize;
+        HIPCHECK(hipEventCreate(&event_start));
+        HIPCHECK(hipEventCreate(&event_phase2finish));
+        HIPCHECK(hipEventCreate(&event_phase1finish));
+        events = new hipEvent_t[4*streamsize];
+        for(int i=0; i<4*streamsize; i++) HIPCHECK(hipEventCreate(&events[i]));
+        graphCreated = false;
+    }
+
+    void SingleGraph::syncallstreams(hipEvent_t *eventsptr, hipStream_t *streamptr, int stream_size) {
+        for(int streami=1; streami < stream_size; streami++){
+            hipEventRecord(eventsptr[streami], streamptr[streami]);
+        }
+        for(int streami=1; streami < stream_size; streami++){
+            hipStreamWaitEvent(streamptr[0], eventsptr[streami],0);
+        }
+        hipEventRecord(eventsptr[0], streamptr[0]);
+        for(int streami=1; streami < stream_size; streami++){
+            hipStreamWaitEvent(streamptr[streami], eventsptr[0],0);
+        }
+    }
+
+    void SingleGraph::syncstream0(hipEvent_t *eventsptr, hipStream_t *streamptr, int stream_size) {
+        for(int streami=1; streami < stream_size; streami++){
+            hipEventRecord(eventsptr[streami], streamptr[streami]);
+        }
+        for(int streami=1; streami < stream_size; streami++){
+            hipStreamWaitEvent(streamptr[0], eventsptr[streami],0);
+        }
+    }
+
+    void SingleGraph::syncotherstreams(hipEvent_t event, hipStream_t * streamptr, int stream_size){
+        hipEventRecord(event, streamptr[0]);
+        for(int streami=1; streami<stream_size; streami++){
+            hipStreamWaitEvent(streamptr[streami], event,0);
+        }
+    }
+
+    MultiGraph::MultiGraph() {}
+
+    void MultiGraph::StreamInit(int batchsize, int streamsize) {
+        this->batchsize = batchsize;
+        this->streamsize = streamsize;
+        // multi graph creation
+        event_start.resize(batchsize);
+        event_phase2finish.resize(batchsize);
+        graphCreated.resize(batchsize);
+        instance.resize(batchsize);
+        graph.resize(batchsize);
+        events = new hipEvent_t*[batchsize];
+        for(int bi=0; bi<batchsize; bi++){
+            HIPCHECK(hipEventCreate(&event_start[bi]));
+            HIPCHECK(hipEventCreate(&event_phase2finish[bi]));
+            events[bi] = new hipEvent_t[4*streamsize];
+            for(int i=0; i<4*streamsize; i++) HIPCHECK(hipEventCreate(&events[bi][i]));
+            graphCreated[bi] = false;
+        }
+    }
+
+    CUDAI8basesPointers::CUDAI8basesPointers() {}
+    CUDAI8XPointers::CUDAI8XPointers(){}
+
+    CBMaxInfo::CBMaxInfo() {maxA=maxx=maxy=maxbatchsize=0;}
+
+    void getcomplexvectormax(complex<float> *hy, size_t xlength){
+        double rmax = 0;
+        double imax = 0;
+        for(int i=0; i<xlength; i++){
+            rmax = fmax(rmax, abs(hy[i].real()));
+            imax = fmax(imax, abs(hy[i].imag()));
+        }
+        cout << "rmax " << rmax << ", imax " << imax << endl;
+    }
+
+}
+
diff --git a/src/tlrmvm/hip/tlrmvmhiputil.hpp b/src/tlrmvm/hip/tlrmvmhiputil.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d3bb82713f48c1595641b30d0fb30e773a0e9dc8
--- /dev/null
+++ b/src/tlrmvm/hip/tlrmvmhiputil.hpp
@@ -0,0 +1,93 @@
+#pragma once
+
+#include <vector>
+using std::vector;
+
+#include "../cpu/TlrmvmCPU.hpp"
+
+#ifdef USE_MPI
+#include <mpi.h>
+#endif
+
+namespace hiptlrmvm {
+
+    struct SingleGraph{
+        SingleGraph();
+        void StreamInit(int streamsize);
+        int streamsize;
+        hipGraph_t graph;
+        bool graphCreated;
+        hipGraphExec_t instance;
+        hipEvent_t* events;
+        hipEvent_t event_start;
+        hipEvent_t event_phase1finish;
+        hipEvent_t event_phase2finish;
+        void syncallstreams(hipEvent_t * events, hipStream_t * stream,int streamsize);
+        void syncstream0(hipEvent_t * events, hipStream_t * stream,int streamsize);
+        void syncotherstreams(hipEvent_t event, hipStream_t * stream,int streamsize);
+    };
+
+    struct MultiGraph{
+        MultiGraph();
+        void StreamInit(int batchsize, int streamsize);
+        int batchsize;
+        int streamsize;
+        vector<hipGraph_t> graph;
+        vector<bool> graphCreated;
+        vector<hipGraphExec_t> instance;
+        hipEvent_t* *events;
+        vector<hipEvent_t> event_start;
+        vector<hipEvent_t> event_phase2finish;
+    };
+
+    // BatchTlrmvmcudaINT8
+
+    struct CUDAI8basesPointers{
+        CUDAI8basesPointers();
+        size_t Acnt;
+        size_t Xcnt;
+        size_t Ycnt;
+        vector<size_t> Ms;
+        vector<size_t> Ks;
+        vector<size_t> Ns;
+
+        hipInt8Complex * Abuffer; // real data buffer
+        vector<hipComplex> maxA;
+        hipComplex * maxA_device; // used to scale up to fp16
+        vector<size_t> Aelems; // each gemv A elems
+        vector<size_t> Aelemsoffset; // each gemv A elems, prefix
+        size_t * Aelemsoffset_device; // used to scale up to fp16
+        hipHalfComplex * ybuffer; // y buffer, always a half buffer
+
+        vector<size_t> xelems; // each gemv x elems
+        vector<size_t> xelemsoffset; // each gemv x elems, prefix
+
+        vector<size_t> yelems; // each gemv y elems
+        vector<size_t> yelemsoffset; // each gemv y elems, prefix
+
+    };
+
+    struct CUDAI8XPointers{
+        CUDAI8XPointers();
+        hipInt8Complex * xbuffer;
+        vector<hipComplex> maxx;
+        hipComplex * maxx_device; // used to scale up to fp16
+        vector<size_t> xelems; // each gemv x elems
+        size_t* xelems_device; // each gemv x elems
+        vector<size_t> xelemsoffset; // each gemv x elems, prefix
+        size_t* xelemsoffset_device; // each gemv x elems, prefix
+        hipComplex *p3xreductionbuffer_device;
+    };
+
+    struct CBMaxInfo{
+        CBMaxInfo();
+        size_t maxA;
+        size_t maxx;
+        size_t maxy;
+        size_t maxbatchsize;
+    };
+
+    void getcomplexvectormax(complex<float> *hy, size_t xlength);
+
+}
+
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 476d37e6804c5179fc87ba745cd3401a23cbcda4..5893bff94bee4d6360f8f9cb17d1bdcba882012b 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -44,3 +44,20 @@ if(BUILD_CUDA)
         install(TARGETS ${cbins} DESTINATION test)
     endforeach()
 endif()
+
+if(BUILD_HIP)
+
+    set(BINS
+            Test_hip_hipblas
+            Test_hip_tlrmvm
+            Test_hip_constrank
+#            Test_hip_tlrmvmgraph
+            )
+    foreach(cbins ${BINS})
+        WrapHIPBinary(${cbins} hip)
+        install(TARGETS ${cbins} DESTINATION test)
+    endforeach()
+#    install(FILES
+#            ${CMAKE_CURRENT_LIST_DIR}/hip/Test_hip_tlrmvm_correctness.sh
+#            DESTINATION test PERMISSIONS OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE)
+endif()
\ No newline at end of file
diff --git a/test/cpp/ex2mpitlrmvm_complexfloat.cpp b/test/cpp/ex2mpitlrmvm_complexfloat.cpp
index edb150dbfec609640799f45fa9bbbc583b42c6a6..902b25fb0acf36084f5762af96f55d0d7fbb2da9 100644
--- a/test/cpp/ex2mpitlrmvm_complexfloat.cpp
+++ b/test/cpp/ex2mpitlrmvm_complexfloat.cpp
@@ -1,17 +1,17 @@
 #include <string>
 #include <vector>
 #include <chrono>
-#include <memory.h>
 #include <algorithm>
 #include <mpi.h>
-
 #include <common/Common.hpp>
 #include <tlrmvm/Tlrmvm.hpp>
+#define real complex<float>
 using namespace std;
 int main (int argc, char ** argv){
     int originM;
     int originN;
     int nb;
+    int loopsize;
     string acc;
     string datafolder;
     string problemname;
@@ -22,7 +22,6 @@ int main (int argc, char ** argv){
     vector<double> bandstat;
     double bytesprocessed;
     size_t granksum;
-    int loopsize;
     auto argparser = ArgsParser(argc, argv);
     originM = argparser.getint("M");
     originN = argparser.getint("N");
@@ -44,18 +43,17 @@ int main (int argc, char ** argv){
     maskmat.Fill(0);
     for(int i=0; i<tlrmvmconfig.Mtg; i++){
         for(int j=0; j<tlrmvmconfig.Ntg; j++){
-            if (j % size == rank )
+            if (j % size == rank)
             maskmat.SetElem(i,j,1);
         }
     }
     tlrmvmconfig.UpdateMaskmat(maskmat);
-    TlrmvmCPU<complex<float>> tlrmvmptr(tlrmvmconfig);
-    auto finalbuffer = new complex<float>[tlrmvmptr.config.paddingM];
-//    tlrmvmptr.xmat.Fill(0.001);
-    memset(finalbuffer, 0, sizeof(complex<float>) * tlrmvmptr.config.paddingM);
+    TlrmvmCPU<real> tlrmvmptr(tlrmvmconfig);
+    auto finalbuffer = new real[tlrmvmptr.config.paddingM];
+    memset(finalbuffer, 0, sizeof(real) * tlrmvmptr.config.paddingM);
     tlrmvmptr.MemoryInit();
-    auto curx = Matrix<complex<float>>(tlrmvmptr.config.originN, 1);
-    curx.Fill(complex<float>(0.1,1.0));
+    auto curx = Matrix<real>(tlrmvmptr.config.originN, 1);
+    curx.Fill(real(0.1,1.0));
     tlrmvmptr.setX(curx.RawPtr(), curx.Shape()[0]);
     for(int i=0; i<loopsize; i++){
         MPI_Barrier(MPI_COMM_WORLD);
@@ -75,32 +73,30 @@ int main (int argc, char ** argv){
                   timestat.size(), MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
     if(rank == 0){
 #ifndef USE_NEC
-        CFPPCMatrix seismicpcmat(datafolder, acc, nb, problemname, originM, originN);
-        seismicpcmat.setX(tlrmvmptr.xmat);
-        seismicpcmat.GetDense();
-        Matrix<complex<float>> yv_pc = seismicpcmat.Phase1();
-        auto hyv = Matrix<complex<float>>(tlrmvmptr.p1ptrs.y, tlrmvmptr.config.workmatgranksum, 1);
+        CFPPCMatrix pcmat(datafolder, acc, nb, problemname, originM, originN);
+        pcmat.setX(tlrmvmptr.xmat);
+        pcmat.GetDense();
+        Matrix<real> yv_pc = pcmat.Phase1();
+        auto hyv = Matrix<real>(tlrmvmptr.p1ptrs.y, tlrmvmptr.config.workmatgranksum, 1);
 //        cout << " Phase 1 Correctness : " << hyv.allclose(yv_pc) << endl;
-        Matrix<complex<float>> yu_pc = seismicpcmat.Phase2();
-        auto hyu = Matrix<complex<float>>(tlrmvmptr.p3ptrs.x, tlrmvmptr.config.workmatgranksum, 1);
+        Matrix<real> yu_pc = pcmat.Phase2();
+        auto hyu = Matrix<real>(tlrmvmptr.p3ptrs.x, tlrmvmptr.config.workmatgranksum, 1);
 //        cout << " Phase 2 Correctness : " << hyu.allclose(yu_pc) << endl;
-        Matrix<complex<float>> y_pc = seismicpcmat.Phase3();
-        auto hy = Matrix<complex<float>>(finalbuffer, tlrmvmptr.config.originM, 1);
-        auto denseout = seismicpcmat.GetDense() * tlrmvmptr.xmat;
+        Matrix<real> y_pc = pcmat.Phase3();
+        auto hy = Matrix<real>(finalbuffer, tlrmvmptr.config.originM, 1);
+        auto denseout = pcmat.GetDense() * tlrmvmptr.xmat;
         cout << " Check MPI Phase 3 Correctness : "<< hy.allclose(denseout) << endl;
 #endif
         std::sort(mergetime.begin(), mergetime.end());
         int N = mergetime.size();
         cout << "median " << mergetime[N / 2] * 1e6 << " us."<< endl;
-        double bytes = TLRMVMBytesProcessed<complex<float>>(tlrmvmptr.config.granksum,
+        double bytes = TLRMVMBytesProcessed<real>(tlrmvmptr.config.granksum,
                                                    tlrmvmptr.config.nb, originM, originN);
         cout << "U and V bases size: " << bytes * 1e-6 << " MB." << endl;
         cout << "Bandwidth " << bytes / mergetime[N/2] * 1e-9 << " GB/s" << endl;
     }
-    delete[] finalbuffer;
     tlrmvmptr.MemoryFree();
+    delete[] finalbuffer;
     MPI_Finalize();
     return 0;
 }
-
-
diff --git a/test/cpp/ex2mpitlrmvm_float.cpp b/test/cpp/ex2mpitlrmvm_float.cpp
index 5685e0af5846ca72887cc91d31e99ef2a86d46dd..aad825c8e4b32e2f421550af7f773659f07dd784 100644
--- a/test/cpp/ex2mpitlrmvm_float.cpp
+++ b/test/cpp/ex2mpitlrmvm_float.cpp
@@ -1,11 +1,11 @@
 #include <string>
 #include <vector>
 #include <chrono>
-
 #include <algorithm>
 #include <mpi.h>
 #include <common/Common.hpp>
 #include <tlrmvm/Tlrmvm.hpp>
+#define real float
 using namespace std;
 int main (int argc, char ** argv){
     int originM;
@@ -27,10 +27,10 @@ int main (int argc, char ** argv){
     originN = argparser.getint("N");
     nb = argparser.getint("nb");
     loopsize = argparser.getint("loopsize");
-    acc = argparser.getstring("errorthreshold");
-    problemname = argparser.getstring("problemname");
+    acc = argparser.getstring("threshold");
+    problemname = argparser.getstring("problem");
     datafolder = argparser.getstring("datafolder");
-    char rpath[100]; 
+    char rpath[300];
     sprintf(rpath, "%s/%s_Rmat_nb%d_acc%s.bin", datafolder.c_str(), problemname.c_str(), nb, acc.c_str());
     rankfile = string(rpath);
     TlrmvmConfig tlrmvmconfig(originM, originN, nb, datafolder, acc, problemname);
@@ -47,21 +47,22 @@ int main (int argc, char ** argv){
             maskmat.SetElem(i,j,1);
         }
     }
-    tlrmvmconfig.Maskmat = maskmat;
-    TlrmvmCPU<float> tlrmvmptr(tlrmvmconfig);
-    double bytes = TLRMVMBytesProcessed<float>(tlrmvmptr.config.granksum,
-                                               tlrmvmptr.config.nb, tlrmvmptr.config.paddingM,
-                                               tlrmvmptr.config.paddingN);
+    tlrmvmconfig.UpdateMaskmat(maskmat);
+    TlrmvmCPU<real> tlrmvmptr(tlrmvmconfig);
+    auto finalbuffer = new real[tlrmvmptr.config.paddingM];
+    memset(finalbuffer, 0, sizeof(real) * tlrmvmptr.config.paddingM);
     tlrmvmptr.MemoryInit();
-    auto finalbuffer = new float[tlrmvmptr.config.originM];
-    for(int i=0; i<tlrmvmptr.config.originM; i++) finalbuffer[i] = 0.0;
+    auto curx = Matrix<real>(tlrmvmptr.config.originN, 1);
+    curx.Fill(0.1);
+    tlrmvmptr.setX(curx.RawPtr(), curx.Shape()[0]);
     for(int i=0; i<loopsize; i++){
         MPI_Barrier(MPI_COMM_WORLD);
         auto start = std::chrono::steady_clock::now();
         tlrmvmptr.MVM();
+        tlrmvmptr.CopyToFinalresults();
         MPI_Barrier(MPI_COMM_WORLD);
         MPI_Reduce(tlrmvmptr.finalresults,
-                   finalbuffer, tlrmvmptr.config.originM,
+                   finalbuffer, tlrmvmptr.config.paddingM,
                    MPI_FLOAT, MPI_SUM, 0, MPI_COMM_WORLD);
         auto end = std::chrono::steady_clock::now();
         auto elapsed_time = std::chrono::duration_cast<std::chrono::microseconds>(end - start).count();
@@ -71,22 +72,25 @@ int main (int argc, char ** argv){
     MPI_Allreduce(timestat.data(), mergetime.data(),
                   timestat.size(), MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
     if(rank == 0){
-        FPPCMatrix seismicpcmat(datafolder, acc, nb, problemname, originM, originN);
-        seismicpcmat.setX(tlrmvmptr.xmat);
-        seismicpcmat.GetDense();
-        Matrix<float> yv_pc = seismicpcmat.Phase1();
-        auto hyv = Matrix<float>(tlrmvmptr.p1ptrs.y, tlrmvmptr.config.workmatgranksum, 1);
-        // cout << " Phase 1 Correctness : " << hyv.allclose(yv_pc) << endl;
-        Matrix<float> yu_pc = seismicpcmat.Phase2();
-        auto hyu = Matrix<float>(tlrmvmptr.p3ptrs.x, tlrmvmptr.config.workmatgranksum, 1);
-        // cout << " Phase 2 Correctness : " << hyu.allclose(yu_pc) << endl;
-        Matrix<float> y_pc = seismicpcmat.Phase3();
-        auto hy = Matrix<float>(tlrmvmptr.p3ptrs.y, tlrmvmptr.config.originM, 1);
-        cout << " Check MPI Phase 3 Correctness : "<< hy.allclose(y_pc) << endl;
+#ifndef USE_NEC
+        FPPCMatrix pcmat(datafolder, acc, nb, problemname, originM, originN);
+        pcmat.setX(tlrmvmptr.xmat);
+        pcmat.GetDense();
+        Matrix<real> yv_pc = pcmat.Phase1();
+        auto hyv = Matrix<real>(tlrmvmptr.p1ptrs.y, tlrmvmptr.config.workmatgranksum, 1);
+//        cout << " Phase 1 Correctness : " << hyv.allclose(yv_pc) << endl;
+        Matrix<real> yu_pc = pcmat.Phase2();
+        auto hyu = Matrix<real>(tlrmvmptr.p3ptrs.x, tlrmvmptr.config.workmatgranksum, 1);
+//        cout << " Phase 2 Correctness : " << hyu.allclose(yu_pc) << endl;
+        Matrix<real> y_pc = pcmat.Phase3();
+        auto hy = Matrix<real>(finalbuffer, tlrmvmptr.config.originM, 1);
+        auto denseout = pcmat.GetDense() * tlrmvmptr.xmat;
+        cout << " Check MPI Phase 3 Correctness : "<< hy.allclose(denseout) << endl;
+#endif
         std::sort(mergetime.begin(), mergetime.end());
         int N = mergetime.size();
         cout << "median " << mergetime[N / 2] * 1e6 << " us."<< endl;
-        double bytes = TLRMVMBytesProcessed<float>(tlrmvmptr.config.granksum,
+        double bytes = TLRMVMBytesProcessed<real>(tlrmvmptr.config.granksum,
                                                    tlrmvmptr.config.nb, originM, originN);
         cout << "U and V bases size: " << bytes * 1e-6 << " MB." << endl;
         cout << "Bandwidth " << bytes / mergetime[N/2] * 1e-9 << " GB/s" << endl;
@@ -96,5 +100,3 @@ int main (int argc, char ** argv){
     MPI_Finalize();
     return 0;
 }
-
-
diff --git a/test/hip/Test_hip_constrank.cpp b/test/hip/Test_hip_constrank.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f094486e1afa86f97cd6efe9677615f21625c6a9
--- /dev/null
+++ b/test/hip/Test_hip_constrank.cpp
@@ -0,0 +1,58 @@
+#include <iostream>
+#include <unistd.h>
+#include <memory.h>
+#include "common/Common.hpp"
+#include "tlrmvm/Tlrmvm.hpp"
+
+using namespace hiptlrmvm;
+
+int main (int argc, char ** argv){
+    auto argparser = ArgsParser(argc, argv);
+    auto originM = argparser.getint("M");
+    auto originN = argparser.getint("N");
+    auto nb = argparser.getint("nb");
+    auto ranksize = argparser.getint("ranksize");
+    auto loopsize = argparser.getint("loopsize");
+
+    // rank size should be smaller than nb.
+    TlrmvmConfig tlrmvmconfig(originM, originN, nb, ranksize);
+    /********************************
+     * hip instance
+     ********************************/
+    TlrmvmhipConstRank<complex<float>, hipComplex> cudatlrmvmptr(tlrmvmconfig);
+    cudatlrmvmptr.StreamInit(0);
+    cudatlrmvmptr.MemoryInit();
+    cudatlrmvmptr.SetTransposeConjugate(false, false);
+    cudatlrmvmptr.TryConjugateXvec();
+
+    // time
+    hipEvent_t start;
+    hipEvent_t stop;
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    vector<double> rawtime;
+    float milliseconds = 0;
+
+    for(int i=0; i<loopsize; i++){
+        hipEventRecord(start);
+        // do the computation and send results back to cpu instance.
+        cudatlrmvmptr.MVM();
+        hipEventRecord(stop);
+        hipEventSynchronize(stop);
+        hipEventElapsedTime(&milliseconds, start, stop);
+        rawtime.push_back(milliseconds * 1e-3);
+    }
+
+    cudatlrmvmptr.TryConjugateResults();
+    cudatlrmvmptr.CopyBackResults();
+
+    std::sort(rawtime.begin(), rawtime.end());
+    int nruns = rawtime.size();
+    cout << "Median Time " << rawtime[nruns/2] * 1e6  << " us."<< endl;
+    double bytes = TLRMVMBytesProcessed<complex<float>>(cudatlrmvmptr.config.granksum, nb,
+                                                        originM, originN);
+    cout << "Bandwidth: " << bytes / rawtime[nruns/2] * 1e-9 << " GB/s." << endl;
+
+    cudatlrmvmptr.MemoryFree();
+    return 0;
+}
diff --git a/test/hip/Test_hip_hipblas.cpp b/test/hip/Test_hip_hipblas.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b06a1f3de013f2c13cf1b9e93499471638c85257
--- /dev/null
+++ b/test/hip/Test_hip_hipblas.cpp
@@ -0,0 +1,20 @@
+#include <hipblas.h>
+#include <iostream>
+#include <hip/hip_runtime.h>
+using namespace std;
+int main(){
+    hipEvent_t start;
+    hipEvent_t stop;
+    hipEventCreate(&start);
+    hipEventCreate(&stop);
+    hipEventRecord(start);
+    for(int i=0; i<10; i++){
+        cout << "hello" << endl;
+    }
+    hipEventRecord(stop);
+    hipEventSynchronize(stop);
+    float milliseconds = 0;
+    hipEventElapsedTime(&milliseconds, start, stop);
+    cout << "time "<< milliseconds << endl;
+
+}
diff --git a/test/hip/Test_hip_tlrmvm.cpp b/test/hip/Test_hip_tlrmvm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7398ad26325de513e9d666b5b8d7c03f89981ccf
--- /dev/null
+++ b/test/hip/Test_hip_tlrmvm.cpp
@@ -0,0 +1,52 @@
+#include <iostream>
+#include <unistd.h>
+#include <memory.h>
+#include "common/Common.hpp"
+#include "tlrmvm/Tlrmvm.hpp"
+
+using namespace hiptlrmvm;
+
+int main (int argc, char ** argv){
+    auto argparser = ArgsParser(argc, argv);
+    auto originM = argparser.getint("M");
+    auto originN = argparser.getint("N");
+    auto nb = argparser.getint("nb");
+    auto threshold = argparser.getstring("threshold");
+    auto problem = argparser.getstring("problem");
+    auto datafolder = argparser.getstring("datafolder");
+    auto streams = argparser.getint("streams");
+    TlrmvmConfig tlrmvmconfig(originM, originN, nb, datafolder, threshold, problem);
+    /********************************
+     * hip instance
+     ********************************/
+    Tlrmvmhip<complex<float>, hipComplex> cudaptr(tlrmvmconfig);
+    cudaptr.StreamInit(streams);
+    cudaptr.MemoryInit();
+
+    // do the computation and send results back to cpu instance.
+    cudaptr.Phase1();
+    cudaptr.Phase2();
+    cudaptr.Phase3();
+    cudaptr.CopyBackResults();
+
+    CFPPCMatrix seismicpcmat(datafolder, threshold, nb, problem, originM, originN);
+    auto tlrmvmcpu = cudaptr.tlrmvmcpu;
+    seismicpcmat.setX(cudaptr.tlrmvmcpu->xmat);
+    auto densemat = seismicpcmat.GetDense();
+    Matrix<complex<float>> yv_pc = seismicpcmat.Phase1();
+    auto hyv = Matrix<complex<float>>(tlrmvmcpu->p1ptrs.y, tlrmvmcpu->config.workmatgranksum, 1);
+    cout << "====================================================" << endl;
+    cout << "Test TLR-MVM CUDA Implementation. " << endl;
+    cout << " Phase 1 Correctness : " << hyv.allclose(yv_pc) << endl;
+    Matrix<complex<float>> yu_pc = seismicpcmat.Phase2();
+    auto hyu = Matrix<complex<float>>(tlrmvmcpu->p3ptrs.x, tlrmvmcpu->config.workmatgranksum, 1);
+    cout << " Phase 2 Correctness : " << hyu.allclose(yu_pc) << endl;
+    Matrix<complex<float>> y_pc = seismicpcmat.Phase3();
+    auto hy = Matrix<complex<float>>(tlrmvmcpu->p3ptrs.y, tlrmvmcpu->config.paddingM, 1);
+    cout << " Phase 3 Correctness : "<< hy.allclose(y_pc) << endl;
+    auto denseout = densemat * cudaptr.tlrmvmcpu->xmat;
+    cout << "dense results vs tlrmvm results " << hy.allclose(denseout) << endl;
+    cout << "====================================================" << endl;
+    cudaptr.MemoryFree();
+    return 0;
+}
diff --git a/test/hip/Test_hip_tlrmvm_singlecall.cpp b/test/hip/Test_hip_tlrmvm_singlecall.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..261877f9c9b0b4a5717eccd494f94e260907a094
--- /dev/null
+++ b/test/hip/Test_hip_tlrmvm_singlecall.cpp
@@ -0,0 +1,44 @@
+#include <iostream>
+#include <unistd.h>
+#include <memory.h>
+#include "common/Common.hpp"
+#include "tlrmvm/Tlrmvm.hpp"
+
+using namespace hiptlrmvm;
+
+int main (int argc, char ** argv){
+    auto argparser = ArgsParser(argc, argv);
+    auto originM = argparser.getint("M");
+    auto originN = argparser.getint("N");
+    auto nb = argparser.getint("nb");
+    auto threshold = argparser.getstring("threshold");
+    auto problem = argparser.getstring("problem");
+    auto datafolder = argparser.getstring("datafolder");
+    auto streams = argparser.getint("streams");
+    TlrmvmConfig tlrmvmconfig(originM, originN, nb, datafolder, threshold, problem);
+    /********************************
+     * hip instance
+     ********************************/
+    Tlrmvmhip<complex<float>, hipComplex> cudatlrmvmptr(tlrmvmconfig);
+    cudatlrmvmptr.StreamInit(streams);
+    cudatlrmvmptr.MemoryInit();
+    cudatlrmvmptr.SetTransposeConjugate(false, false);
+    cudatlrmvmptr.TryConjugateXvec();
+
+    // do the computation and send results back to cpu instance.
+    cudatlrmvmptr.MVM();
+
+    cudatlrmvmptr.TryConjugateResults();
+    cudatlrmvmptr.CopyBackResults();
+
+    CFPPCMatrix seismicpcmat(datafolder, threshold, nb, problem, originM, originN);
+    auto densemat = seismicpcmat.GetDense();
+    auto hy = Matrix<complex<float>>(cudatlrmvmptr.tlrmvmcpu->finalresults, cudatlrmvmptr.tlrmvmcpu->config.originM, 1);
+    auto denseout = densemat * cudatlrmvmptr.tlrmvmcpu->xmat;
+    cout << "====================================================" << endl;
+    cout << "Test TLR-MVM conjugate single call Implementation. " << endl;
+    cout << "dense results vs tlrmvm results " << hy.allclose(denseout) << endl;
+    cout << "====================================================" << endl;
+    cudatlrmvmptr.MemoryFree();
+    return 0;
+}
diff --git a/thirdparty/pybind11 b/thirdparty/pybind11
deleted file mode 160000
index 45f792efdd92da094548e2095d6efdbfa7e536ee..0000000000000000000000000000000000000000
--- a/thirdparty/pybind11
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 45f792efdd92da094548e2095d6efdbfa7e536ee