How to use clang to compile OpenCL to ptx code?

后端 未结 3 638
北恋
北恋 2021-02-02 00:31

Clang 3.0 is able to compile OpenCL to PTX, and Nvidia's tools can then be used to launch the PTX code on a GPU. How can I do this? Please be specific.

3条回答
  •  隐瞒了意图╮
    2021-02-02 00:59

    Here is a brief guide on how to do it with Clang trunk (3.4 at this point) and libclc. I assume you have basic knowledge of how to configure and compile LLVM and Clang, so I have just listed the configure flags I used.

    square.cl:

    __kernel void vector_square(__global float4* input,  __global float4* output) {
      int i = get_global_id(0);
      output[i] = input[i]*input[i];
    }
    
    1. Compile llvm and clang with nvptx support:

      ../llvm-trunk/configure --prefix=$PWD/../install-trunk --enable-debug-runtime --enable-jit --enable-targets=x86,x86_64,nvptx
      make install
      
    2. Get libclc (git clone http://llvm.org/git/libclc.git) and compile it.

      ./configure.py --with-llvm-config=$PWD/../install-trunk/bin/llvm-config
      make
      

    If you have problems compiling this, you might need to fix a couple of headers in ./utils/prepare-builtins.cpp:

    -#include "llvm/Function.h"
    -#include "llvm/GlobalVariable.h"
    -#include "llvm/LLVMContext.h"
    -#include "llvm/Module.h"
    +#include "llvm/IR/Function.h"
    +#include "llvm/IR/GlobalVariable.h"
    +#include "llvm/IR/LLVMContext.h"
    +#include "llvm/IR/Module.h"
    
    3. Compile the kernel to LLVM IR assembly:

      clang -Dcl_clang_storage_class_specifiers -isystem libclc/generic/include -include clc/clc.h -target nvptx -xcl square.cl -emit-llvm -S -o square.ll
      
    4. Link the kernel with the builtin implementations from libclc:

      llvm-link libclc/nvptx--nvidiacl/lib/builtins.bc square.ll -o square.linked.bc
      
    5. Compile the fully linked LLVM IR to PTX:

      clang -target nvptx square.linked.bc -S -o square.nvptx.s
      

    square.nvptx.s:

        //
        // Generated by LLVM NVPTX Back-End
        //
        .version 3.1
        .target sm_20, texmode_independent
        .address_size 32
    
                // .globl       vector_square
    
        .entry vector_square(
                .param .u32 .ptr .global .align 16 vector_square_param_0,
                .param .u32 .ptr .global .align 16 vector_square_param_1
        )
        {
                .reg .pred %p<396>;
                .reg .s16 %rc<396>;
                .reg .s16 %rs<396>;
                .reg .s32 %r<396>;
                .reg .s64 %rl<396>;
                .reg .f32 %f<396>;
                .reg .f64 %fl<396>;
    
                ld.param.u32    %r0, [vector_square_param_0];
                mov.u32 %r1, %ctaid.x;
                ld.param.u32    %r2, [vector_square_param_1];
                mov.u32 %r3, %ntid.x;
                mov.u32 %r4, %tid.x;
                mad.lo.s32      %r1, %r3, %r1, %r4;
                shl.b32         %r1, %r1, 4;
                add.s32         %r0, %r0, %r1;
                ld.global.v4.f32        {%f0, %f1, %f2, %f3}, [%r0];
                mul.f32         %f0, %f0, %f0;
                mul.f32         %f1, %f1, %f1;
                mul.f32         %f2, %f2, %f2;
                mul.f32         %f3, %f3, %f3;
                add.s32         %r0, %r2, %r1;
                st.global.f32   [%r0+12], %f3;
                st.global.f32   [%r0+8], %f2;
                st.global.f32   [%r0+4], %f1;
                st.global.f32   [%r0], %f0;
                ret;
        }
    

提交回复
热议问题