Assignment 2: Programming LLVM IR in a Multiblock Kernel
In this assignment, you will program LLVM IR to implement a new MultiBlock kernel in Parabix.
PackDemo Example
You can use the packdemo.cpp program below as a model.
/*
 *  Part of the Parabix Project, under the Open Software License 3.0.
 *  SPDX-License-Identifier: OSL-3.0
 */

#include <kernel/io/source_kernel.h>
#include <kernel/io/stdout_kernel.h>               // for StdOutKernel_
#include <llvm/IR/Function.h>                      // for Function, Function...
#include <llvm/IR/Module.h>                        // for Module
#include <llvm/Support/CommandLine.h>              // for ParseCommandLineOp...
#include <llvm/Support/Debug.h>                    // for dbgs
#include <kernel/core/kernel_builder.h>
#include <toolchain/toolchain.h>
#include <kernel/pipeline/driver/cpudriver.h>
#include <kernel/core/streamset.h>
#include <kernel/io/stdout_kernel.h>
#include <llvm/ADT/StringRef.h>
#include <kernel/pipeline/pipeline_builder.h>
#include <fcntl.h>                                 // for open, O_RDONLY
#include <unistd.h>                                // for close
#include <cmath>                                   // for std::log2
#include <cstdint>                                 // for uint32_t
#include <string>                                  // for std::string

// Illustrator helpers: each captures the named stream through the pipeline
// variable `P`, but only when codegen::EnableIllustrator is set.
#define SHOW_STREAM(name) if (codegen::EnableIllustrator) P->captureBitstream(#name, name)
#define SHOW_BIXNUM(name) if (codegen::EnableIllustrator) P->captureBixNum(#name, name)
#define SHOW_BYTES(name) if (codegen::EnableIllustrator) P->captureByteData(#name, name)

using namespace kernel;
using namespace llvm;
using namespace codegen;

static cl::OptionCategory PackDemoOptions("Pack Demo Options", "Pack demo options.");
static cl::opt<std::string> inputFile(cl::Positional, cl::desc("<input file>"), cl::Required, cl::cat(PackDemoOptions));

/// Multiblock kernel that consumes "byteStream" at FixedRate(inputRate) and
/// produces "Packed" at FixedRate(outputRate), i.e. two input items per
/// output item.
class PackKernel final : public MultiBlockKernel {
public:
    PackKernel(KernelBuilder & b,
               StreamSet * const byteStream,
               StreamSet * const Packed);
    // Presumably the field width, in bits, of the items in both stream sets
    // (main creates them with CreateStreamSet(1, 8)) -- TODO confirm.
    static constexpr unsigned fw = 8;
    static constexpr unsigned inputRate = 2;
    static constexpr unsigned outputRate = 1;
protected:
    void generateMultiBlockLogic(KernelBuilder & b, llvm::Value * const numOfStrides) override;
};

PackKernel::PackKernel(KernelBuilder & b,
                       StreamSet * const byteStream,
                       StreamSet * const Packed)
: MultiBlockKernel(b, "pack_kernel",
                   {Binding{"byteStream", byteStream, FixedRate(inputRate)}},
                   {Binding{"Packed", Packed, FixedRate(outputRate)}},
                   {}, {}, {}) {}

/// Emits the IR for the kernel body: a loop that, per iteration, loads
/// fw * inputRate packs from "byteStream", combines each adjacent pair with
/// hsimd_packh(16, ...), and stores the fw * outputRate results to "Packed".
///
/// @param b             builder used to emit the IR.
/// @param numOfStrides  number of strides to process in this invocation.
void PackKernel::generateMultiBlockLogic(KernelBuilder & b, Value * const numOfStrides) {
    constexpr unsigned inputPacksPerStride = fw * inputRate;
    constexpr unsigned outputPacksPerStride = fw * outputRate;

    BasicBlock * entry = b.GetInsertBlock();
    BasicBlock * packLoop = b.CreateBasicBlock("packLoop");
    BasicBlock * packFinalize = b.CreateBasicBlock("packFinalize");
    Constant * const ZERO = b.getSize(0);

    // When the stride differs from the bit-block width, convert the stride
    // count into a block count by shifting by log2(stride / blockWidth).
    Value * numOfBlocks = numOfStrides;
    if (getStride() != b.getBitBlockWidth()) {
        numOfBlocks = b.CreateShl(numOfStrides, b.getSize(std::log2(getStride()/b.getBitBlockWidth())));
        llvm::errs() << "stride = " << getStride() << "\n";
    }
    b.CreateBr(packLoop);

    // The loop is entered unconditionally, so its body runs at least once.
    b.SetInsertPoint(packLoop);
    PHINode * blockOffsetPhi = b.CreatePHI(b.getSizeTy(), 2);
    blockOffsetPhi->addIncoming(ZERO, entry);

    // NOTE(review): the same blockOffsetPhi indexes both the input and the
    // output even though the input is consumed at twice the output rate;
    // confirm that this addresses the intended input packs when more than
    // one block is processed per call.
    Value * bytepack[inputPacksPerStride];
    for (unsigned i = 0; i < inputPacksPerStride; i++) {
        bytepack[i] = b.loadInputStreamPack("byteStream", ZERO, b.getInt32(i), blockOffsetPhi);
    }

    // Each output pack is produced from two consecutive input packs.
    Value * packed[outputPacksPerStride];
    for (unsigned i = 0; i < outputPacksPerStride; i++) {
        packed[i] = b.hsimd_packh(16, bytepack[2*i], bytepack[2*i+1]);
        b.storeOutputStreamPack("Packed", ZERO, b.getInt32(i), blockOffsetPhi, packed[i]);
    }

    // Advance the block offset and loop until every block has been handled.
    Value * nextBlk = b.CreateAdd(blockOffsetPhi, b.getSize(1));
    blockOffsetPhi->addIncoming(nextBlk, packLoop);
    Value * moreToDo = b.CreateICmpNE(nextBlk, numOfBlocks);
    b.CreateCondBr(moreToDo, packLoop, packFinalize);

    b.SetInsertPoint(packFinalize);
}

/// Signature of the compiled pipeline: takes the input file descriptor.
using PackDemoFunctionType = void (*)(uint32_t fd);

/// Builds and compiles the pipeline ReadSourceKernel -> PackKernel ->
/// StdOutKernel and returns the compiled entry point.
PackDemoFunctionType packdemo_gen (CPUDriver & driver) {
    auto & b = driver.getBuilder();
    auto P = driver.makePipeline({Binding{b.getInt32Ty(), "inputFileDecriptor"}}, {});
    Scalar * fileDescriptor = P->getInputScalar("inputFileDecriptor");

    // Source data
    StreamSet * const codeUnitStream = P->CreateStreamSet(1, 8);
    P->CreateKernelCall<ReadSourceKernel>(fileDescriptor, codeUnitStream);

    StreamSet * const packedStream = P->CreateStreamSet(1, 8);
    P->CreateKernelCall<PackKernel>(codeUnitStream, packedStream);

    P->CreateKernelCall<StdOutKernel>(packedStream);

    return reinterpret_cast<PackDemoFunctionType>(P->compile());
}

/// Parses the command line, opens the input file, and runs the compiled
/// pipeline on it. Returns 0 on success and 1 if the file cannot be opened.
int main(int argc, char *argv[]) {
    codegen::ParseCommandLineOptions(argc, argv, {&PackDemoOptions, codegen::codegen_flags()});
    CPUDriver pxDriver("packdemo");
    const int fd = open(inputFile.c_str(), O_RDONLY);
    if (LLVM_UNLIKELY(fd == -1)) {
        errs() << "Error: cannot open " << inputFile << " for processing. Skipped.\n";
        // Report the failure to the caller instead of exiting with success.
        return 1;
    }
    PackDemoFunctionType func = packdemo_gen(pxDriver);
    func(fd);
    close(fd);
    return 0;
}
Note that BlockSize is defined as the SIMD register width in bits. BlockSize is also the number of items processed per data block. When items are wider than 1 bit, the BlockSize must be multiplied by the field width to determine the total length of a block in bits. Similarly, when accessing items using SIMD operations, the number of registers full of data that need to be processed is based on the field width (for example, a block of 8-bit items occupies 8 registers).
To complete the packdemo implementation, you can include the cpp file in tools/util and update the CMakeLists.txt file to include:
# Declares the packdemo executable built from packdemo.cpp.
# DEPS presumably names the Parabix components it links against -- confirm
# against the project's CMake macros.
parabix_add_executable(
    NAME packdemo
    SRC packdemo.cpp
    DEPS kernel.io kernel.pipeline kernel.streamutils kernel.util toolchain
)
Assignment
Implement a multiblock kernel to perform a function of your choice.
One possibility is to implement a kernel to expand the input by a factor of two. You can use SIMD expansion operations in the IDISA_builder such as esimd_mergeh and esimd_mergel. Alternatively, you could use the ZExt or SExt operations of LLVM.
Another possibility is to implement a kernel to produce an accumulated value through processing of a file. In this case, you will need to add an InternalScalar to hold the value you compute after each block.