Not logged in. Login

SIMD Explorations With icgrep

The Phases of icgrep

  • icgrep parses regular expressions into an AST form.
  • Regular expression ASTs are compiled into Pablo, a language on unbounded bitstreams.
  • Pablo is compiled to block-at-a-time LLVM IR, where the block size is typically 128 or 256 bits (the SIMD register width).
  • LLVM IR is dynamically compiled (just-in-time compilation) to machine code.
  • The machine code is executed to process the given data files.

Displaying the Output of Phases

  • Use -print-parsed-REs to show parsed regular expressions.
 ./icgrep 'ab*c|de{1,7}f' -print-parsed-REs
Parser:
(Alt[(Seq[Name "CC_61" ,Rep(Name "CC_62" ,0,Unbounded),Name "CC_63" ]),(Seq[Name "CC_64" ,Rep(Name "CC_65" ,1,7),Name "CC_66" ])])
  • Use -print-pablo to show the Pablo code resulting from compiling REs.
./icgrep 'ab*c|de{1,7}f' -print-pablo
Final Pablo AST:
not_ = (~basis7)
not_1 = (~basis5)
not_2 = (~basis3)
not_3 = (~basis2)
not_4 = (~basis1)
...
...
  • Use -dump-generated-IR to show the LLVM IR produced by the Pablo compiler.
./icgrep 'ab*c|de{1,7}f' -dump-generated-IR
; ModuleID = 'grepcode'

@process_block_carry_data = common global [28 x <2 x i64>] zeroinitializer, align 16
@blockNo = common global i64 0, align 16

; Function Attrs: nounwind uwtable
define void @s2p_block(<2 x i64>* nocapture readonly %byte_data, { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> }* nocapture readnone %basis_bits) #0 {
...
...
  • To see machine assembly code, first dump the IR to a file and then use llc -filetype=asm.
./icgrep 'ab*c|de{1,7}f' -dump-generated-IR >& abcdef.ll
../libllvm/bin/llc -filetype=asm abcdef.ll
cat abcdef.s
	.text
	.file	"abcdef.ll"
	.section	.rodata.cst16,"aM",@progbits,16
	.align	16
.LCPI0_0:
	.short	255                     # 0xff
	.short	255                     # 0xff
	.short	255                     # 0xff
	.short	255                     # 0xff
	.short	255                     # 0xff
	.short	255                     # 0xff
	.short	255                     # 0xff
	.short	255                     # 0xff
.LCPI0_1:
	.zero	16,170
.LCPI0_2:
	.zero	16,204
.LCPI0_3:
	.zero	16,240
	.text
	.globl	s2p_block
	.align	16, 0x90
	.type	s2p_block,@function
s2p_block:                              # @s2p_block
	.cfi_startproc
# BB#0:                                 # %entry
	movdqa	(%rdi), %xmm14
	movdqa	16(%rdi), %xmm1
	movdqa	32(%rdi), %xmm5
	movdqa	48(%rdi), %xmm2
	pxor	%xmm9, %xmm9
...
...
  • Default code-generation with llc targets SSE2 instructions. We can target AVX instructions instead:
cp abcdef.ll abcdef-sse2.ll
cp abcdef.ll abcdef-avx.ll
../libllvm/bin/llc -filetype=asm -mattr=+avx abcdef-avx.ll
 ../libllvm/bin/llc -filetype=asm abcdef-sse2.ll
diff -y abcdef-sse2.s abcdef-avx.s
# BB#0:                                 # %entry		# BB#0:                                 # %entry
	movdqa	(%rdi), %xmm14				      |		vmovdqa	(%rdi), %xmm1
	movdqa	16(%rdi), %xmm1				      |		vmovdqa	16(%rdi), %xmm2
	movdqa	32(%rdi), %xmm5				      |		vmovdqa	32(%rdi), %xmm8
	movdqa	48(%rdi), %xmm2				      |		vmovdqa	48(%rdi), %xmm3
	pxor	%xmm9, %xmm9				      |		vmovdqa	.LCPI0_0(%rip), %xmm9   # xmm9 = [128,128,128
	movdqa	%xmm1, %xmm3				      |		vpshufb	%xmm9, %xmm2, %xmm4
	punpckhbw	%xmm9, %xmm3    # xmm3 = xmm3[8],xmm9 |		vmovdqa	.LCPI0_1(%rip), %xmm11  # xmm11 = [1,3,5,7,9,
	pshuflw	$-25, %xmm3, %xmm3      # xmm3 = xmm3[3,1,2,3 |		vpshufb	%xmm11, %xmm1, %xmm5
	pshufhw	$-25, %xmm3, %xmm3      # xmm3 = xmm3[0,1,2,3 |		vpor	%xmm4, %xmm5, %xmm6
	pshufd	$-24, %xmm3, %xmm3      # xmm3 = xmm3[0,2,2,3 |		vmovdqa	.LCPI0_2(%rip), %xmm12  # xmm12 = [128,128,12
	pshuflw	$-79, %xmm3, %xmm3      # xmm3 = xmm3[1,0,3,2 |		vpshufb	%xmm12, %xmm2, %xmm2
	movdqa	%xmm1, %xmm4				      |		vmovdqa	.LCPI0_3(%rip), %xmm13  # xmm13 = [0,2,4,6,8,
	punpcklbw	%xmm9, %xmm4    # xmm4 = xmm4[0],xmm9 |		vpshufb	%xmm13, %xmm1, %xmm1
	pshuflw	$-25, %xmm4, %xmm4      # xmm4 = xmm4[3,1,2,3 |		vpor	%xmm2, %xmm1, %xmm2
	pshufhw	$-25, %xmm4, %xmm4      # xmm4 = xmm4[0,1,2,3 |		vpsrlw	$1, %xmm2, %xmm1
	pshufd	$-24, %xmm4, %xmm4      # xmm4 = xmm4[0,2,2,3 |		vmovdqa	.LCPI0_4(%rip), %xmm4   # xmm4 = [12297829382
	pshuflw	$-79, %xmm4, %xmm4      # xmm4 = xmm4[1,0,3,2 |		vpand	%xmm6, %xmm4, %xmm7
	punpcklqdq	%xmm3, %xmm4    # xmm4 = xmm4[0],xmm3 |		vpextrq	$1, %xmm4, %rax
	movdqa	%xmm14, %xmm3				      |		notq	%rax
	punpckhbw	%xmm9, %xmm3    # xmm3 = xmm3[8],xmm9 |		vmovq	%rax, %xmm5
	pshuflw	$-25, %xmm3, %xmm3      # xmm3 = xmm3[3,1,2,3 |		vmovq	%xmm4, %rax
	pshufhw	$-25, %xmm3, %xmm3      # xmm3 = xmm3[0,1,2,3 <
	pshufd	$-24, %xmm3, %xmm3      # xmm3 = xmm3[0,2,2,3 <
	pshuflw	$-79, %xmm3, %xmm6      # xmm6 = xmm3[1,0,3,2 <
	movdqa	%xmm14, %xmm3				      <
	punpcklbw	%xmm9, %xmm3    # xmm3 = xmm3[0],xmm9 <
	pshuflw	$-25, %xmm3, %xmm3      # xmm3 = xmm3[3,1,2,3 <
	pshufhw	$-25, %xmm3, %xmm3      # xmm3 = xmm3[0,1,2,3 <
	pshufd	$-24, %xmm3, %xmm3      # xmm3 = xmm3[0,2,2,3 <
	pshuflw	$-79, %xmm3, %xmm3      # xmm3 = xmm3[1,0,3,2 <
	punpcklqdq	%xmm6, %xmm3    # xmm3 = xmm3[0],xmm6 <
	packuswb	%xmm4, %xmm3			      <
	movdqa	.LCPI0_0(%rip), %xmm8   # xmm8 = [255,255,255 <
	pand	%xmm8, %xmm1				      <
	pand	%xmm8, %xmm14				      <
	packuswb	%xmm1, %xmm14			      <
	movdqa	%xmm14, %xmm12				      <
	psrlw	$1, %xmm12				      <
	movdqa	.LCPI0_1(%rip), %xmm10  # xmm10 = [1229782938 <
	movdqa	%xmm10, %xmm1				      <
	pand	%xmm3, %xmm1				      <
	movabsq	$6148914691236517205, %rax # imm = 0x55555555 <
	movd	%rax, %xmm4				      <
	movd	%xmm10, %rax				      <
Updated Tue Jan. 12 2016, 16:29 by cameron.