Peter Steinbach
steinbach@scionics.de
service provider to the Max Planck Institute of Molecular Cell Biology and Genetics
What should our clients choose?
Share experiences with ROCm/ROCr with the community!
Accelerating our clients' scientific algorithms on GPUs
(multi-GB datasets, many FFTs)
Use the issue tracker of this talk to correct me!
github.com/psteinb/parallel2017ROCm
Porting Code from CUDA
supported CPUs
hcc compiler for supported APIs
GCN ISA assembler and disassembler
/* copy */ c[:] = a[:]
/* mul */ b[:] = scalar*c[:]
/* add */ c[:] = a[:] + b[:]
/* triad */ a[:] = b[:] + scalar*c[:]
/* dot */ scalar = dot(a[:],b[:])
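For reference, a minimal serial C++ sketch of what the five measured kernels compute (the function name and signature are my own illustration, not GPU-STREAM code):

#include <cstddef>
#include <numeric>
#include <vector>

// Illustrative serial reference for the five kernels above.
template <typename T>
T stream_reference(std::vector<T>& a, std::vector<T>& b,
                   std::vector<T>& c, T scalar)
{
  for (std::size_t i = 0; i < a.size(); ++i) c[i] = a[i];                 // copy
  for (std::size_t i = 0; i < a.size(); ++i) b[i] = scalar * c[i];        // mul
  for (std::size_t i = 0; i < a.size(); ++i) c[i] = a[i] + b[i];          // add
  for (std::size_t i = 0; i < a.size(); ++i) a[i] = b[i] + scalar * c[i]; // triad
  return std::inner_product(a.begin(), a.end(), b.begin(), T{0});         // dot
}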
hipify
supports the most commonly used parts of CUDA:
streams, events, memory (de-)allocation, profiling
template <typename T>
__global__ void add_kernel(const T * a,
                           const T * b,
                           T * c)
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  c[i] = a[i] + b[i];
}

template <class T>
void CUDAStream<T>::add()
{
  add_kernel<<<array_size/TBSIZE, TBSIZE>>>(d_a, d_b, d_c);
  check_error(); //...
}
template <typename T>
__global__ void add_kernel(hipLaunchParm lp,
                           const T * a, const T * b,
                           T * c)
{
  const int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
  c[i] = a[i] + b[i];
}

template <class T>
void HIPStream<T>::add()
{
  hipLaunchKernel(HIP_KERNEL_NAME(add_kernel),
                  dim3(array_size/TBSIZE), dim3(TBSIZE), 0, 0,
                  d_a, d_b, d_c);
  check_error(); //...
}
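On the host side the runtime API maps one-to-one as well; a minimal sketch of the hip* calls hipify substitutes for their cuda* counterparts (buffer and size names are illustrative):

#include <hip/hip_runtime.h>

// Illustrative host-side mapping of the CUDA runtime calls hipify rewrites.
void host_side_sketch(const float* h_a, size_t array_size)
{
  float* d_a = nullptr;
  hipMalloc((void**)&d_a, array_size * sizeof(float)); // cudaMalloc
  hipMemcpy(d_a, h_a, array_size * sizeof(float),
            hipMemcpyHostToDevice);                    // cudaMemcpy
  hipStream_t stream;
  hipStreamCreate(&stream);                            // cudaStreamCreate
  hipStreamSynchronize(stream);                        // cudaStreamSynchronize
  hipStreamDestroy(stream);                            // cudaStreamDestroy
  hipFree(d_a);                                        // cudaFree
}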
hc
hc namespace plus C++14. Very similar to thrust, boost.compute, sycl.
#include "Stream.h"
#include "hc.hpp"

template <class T>
class HCStream : public Stream<T>
{
protected:
  unsigned int array_size;
  hc::array<T,1> d_a;
  hc::array<T,1> d_b;
  hc::array<T,1> d_c;
  //...
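An hc::array carries its extent, so sizing happens at construction; a plausible sketch of the constructor (the real one also selects an accelerator, elided here):

// Sketch: the hc::array members are sized in the member initializer list.
template <class T>
HCStream<T>::HCStream(const unsigned int ARRAY_SIZE)
  : array_size(ARRAY_SIZE),
    d_a(ARRAY_SIZE),
    d_b(ARRAY_SIZE),
    d_c(ARRAY_SIZE)
{}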
template <class T>
void HCStream<T>::init_arrays(T _a, T _b, T _c)
{
  hc::array_view<T,1> view_a(this->d_a);
  hc::parallel_for_each(hc::extent<1>(array_size),
                        [=](hc::index<1> i) [[hc]] {
                          view_a[i] = _a;
                        });
  //... same for d_b and d_c
}
template <class T>
void HCStream<T>::add()
{
  hc::array_view<T,1> view_a(this->d_a);
  hc::array_view<T,1> view_b(this->d_b);
  hc::array_view<T,1> view_c(this->d_c);
  hc::parallel_for_each(hc::extent<1>(array_size),
                        [=](hc::index<1> i) [[hc]] {
                          view_c[i] = view_a[i] + view_b[i];
                        });
}
template <class T>
T HCStream<T>::dot()
{
  // Set-up elided on the slide: tiled_ex is a tiled extent over the
  // reduction domain, domain_sz its size, partial an hc::array with
  // one slot per tile; view_a/view_b refer to d_a and d_b.
  hc::parallel_for_each(tiled_ex,
                        [=,
                         &view_a,
                         &view_b,
                         &partial](const hc::tiled_index<1>& tidx) [[hc]] {
    auto gidx = tidx.global[0];
    T r = T{0}; // Assumes reduction op is addition.
    while (gidx < view_a.get_extent().size()) {
      r += view_a[gidx] * view_b[gidx]; // dot product
      gidx += domain_sz;
    }
    tile_static T tileData[TBSIZE];
    tileData[tidx.local[0]] = r;
    tidx.barrier.wait_with_tile_static_memory_fence();
    // tree reduction in tile-static memory
    for (auto h = TBSIZE / 2; h; h /= 2) {
      if (tidx.local[0] < h) {
        tileData[tidx.local[0]] += tileData[tidx.local[0] + h];
      }
      tidx.barrier.wait_with_tile_static_memory_fence();
    }
    // lane 0 of each tile publishes its partial sum
    if (tidx.global == tidx.tile_origin) partial[tidx.tile] = tileData[0];
  });
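  // Sketch (my addition): the kernel leaves one partial sum per tile;
  // copy them back and finish the reduction on the host. Assumes the
  // n_tiles used to size `partial`, plus <vector> and <numeric>.
  std::vector<T> h_partial(n_tiles);
  hc::copy(partial, h_partial.begin());
  return std::accumulate(h_partial.begin(), h_partial.end(), T{0});
}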
std::vector<float> payload(/*pick a number*/);
hc::array<float,1> d_payload(payload.size());
hc::completion_future when_done = hc::async_copy(payload.begin(),
                                                 payload.end(),
                                                 d_payload);
when_done.then(call_kernel_functor); // continuation function!
template <typename functor>
void then(const functor& func); // just a callback for now
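So any nullary callable works as the continuation; a minimal sketch:

// Sketch: a lambda as the callback (needs <cstdio>).
when_done.then([]{ std::puts("copy done"); });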
std::vector<hc::completion_future> streams(n);
for (hc::completion_future& when_done : streams) { // by reference!
  when_done = hc::async_copy(payload_begin_itr,
                             payload_end_itr,
                             d_payload_view);
  when_done.then(parallel_for_each(/*do magic*/))
           .then(hc::async_copy(d_payload_view, result_begin_itr));
}
hc::when_all(streams);
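A minimal fallback sketch in case when_all is not available, assuming only completion_future::wait() (which mirrors std::future):

// Fallback sketch: block on every outstanding future in turn.
for (hc::completion_future& f : streams)
  f.wait();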
concurrency constructs are the glue code of host-device interactions!
(see when_all, co_await and friends)
hc API is expressive and reduces boilerplate code (plans for C++17 on device)