Not logged in. Login

Intro to icgrep

icgrep

icgrep is an SFU-developed command-line tool for regular expression search

  • designed as a modern high-performance replacement for the traditional grep program
  • implemented using parallel techniques
    • bitwise data parallel regular expression matching
    • SIMD parallelism
    • multithread parallelism
  • designed for outstanding performance with complicated regular expressions and big data applications

The Symbolic Data Domains

  • RE: regular expressions
  • Pablo: tiny programming language for unbounded bit streams
  • LLVM IR: portable intermediate representation/high-level assembly language
  • ASM: assembler language of the target machine.

When using icgrep, you can always display the symbolic output of the various phases using the command flags

  • -ShowREs
  • -ShowPablo
  • -ShowIR
  • -ShowASM

The Phases of icgrep

  • RE phases
    • parse REs into internal AST (abstract syntax tree form)
    • perform RE to RE transformations, for example
      • Case-insensitive transformation
      • Star Normal Form
    • compile REs to Pablo using the Parabix regular expression algorithm
  • Pablo phases
    • perform Pablo to Pablo transformations, for example
      • common subexpression elimination
      • dead-code elimination
      • sinking
    • compile Pablo into LLVM IR
  • LLVM IR passes
    • IR to IR transformations
    • compile LLVM IR to target machine architecture
  • ASM phases
    • ASM to ASM transformations, such as peephole optimization
    • generate final binary machine code

Example

./icgrep '(?i)a(?:[bc]*[d-f]*)*[a-z]{2,9}t' -ShowAllREs -ShowPablo -ShowIR -ShowASM

RE Passes

Parser:
Group(+i:(Seq[CC "CC_61" [97]/Unicode,Rep((Seq[Rep(CC "CC_62_63" [98-99]/Unicode,0,Unbounded),Rep(CC "CC_64_66" [100-102]/Unicode,0,Unbounded)]),0,Unbounded),Rep(CC "CC_61_7a" [97-122]/Unicode,2,9),CC "CC_74" [116]/Unicode]))
resolveGraphemeMode:
Group(+i:(Seq[CC "CC_61" [97]/Unicode,Rep((Seq[Rep(CC "CC_62_63" [98-99]/Unicode,0,Unbounded),Rep(CC "CC_64_66" [100-102]/Unicode,0,Unbounded)]),0,Unbounded),Rep(CC "CC_61_7a" [97-122]/Unicode,2,9),CC "CC_74" [116]/Unicode]))
resolveUnicodeProperties:
Group(+i:(Seq[CC "CC_61" [97]/Unicode,Rep((Seq[Rep(CC "CC_62_63" [98-99]/Unicode,0,Unbounded),Rep(CC "CC_64_66" [100-102]/Unicode,0,Unbounded)]),0,Unbounded),Rep(CC "CC_61_7a" [97-122]/Unicode,2,9),CC "CC_74" [116]/Unicode]))
resolveCaseInsensitiveMode:
(Seq[CC "CC_41,61" [65][97]/Unicode,Rep((Seq[Rep(CC "CC_42_43,62_63" [66-67][98-99]/Unicode,0,Unbounded),Rep(CC "CC_44_46,64_66" [68-70][100-102]/Unicode,0,Unbounded)]),0,Unbounded),Rep(CC "CC_41_5a,61_7a,17f,212a" [65-90][97-122][383][8490]/Unicode,2,9),CC "CC_54,74" [84][116]/Unicode])
excludeUnicodeLineBreak:
(Seq[CC "CC_41,61" [65][97]/Unicode,Rep((Seq[Rep(CC "CC_42_43,62_63" [66-67][98-99]/Unicode,0,Unbounded),Rep(CC "CC_44_46,64_66" [68-70][100-102]/Unicode,0,Unbounded)]),0,Unbounded),Rep(CC "CC_41_5a,61_7a,17f,212a" [65-90][97-122][383][8490]/Unicode,2,9),CC "CC_54,74" [84][116]/Unicode])
RemoveNullablePrefix:
(Seq[CC "CC_41,61" [65][97]/Unicode,Rep((Seq[Rep(CC "CC_42_43,62_63" [66-67][98-99]/Unicode,0,Unbounded),Rep(CC "CC_44_46,64_66" [68-70][100-102]/Unicode,0,Unbounded)]),0,Unbounded),Rep(CC "CC_41_5a,61_7a,17f,212a" [65-90][97-122][383][8490]/Unicode,2,9),CC "CC_54,74" [84][116]/Unicode])
RemoveNullableSuffix:
(Seq[CC "CC_41,61" [65][97]/Unicode,Rep((Seq[Rep(CC "CC_42_43,62_63" [66-67][98-99]/Unicode,0,Unbounded),Rep(CC "CC_44_46,64_66" [68-70][100-102]/Unicode,0,Unbounded)]),0,Unbounded),Rep(CC "CC_41_5a,61_7a,17f,212a" [65-90][97-122][383][8490]/Unicode,2,9),CC "CC_54,74" [84][116]/Unicode])
Star_Normal_Form:
(Seq[CC "CC_41,61" [65][97]/Unicode,Rep(CC "CC_42_46,62_66" [66-70][98-102]/Unicode,0,Unbounded),Rep(CC "CC_41_5a,61_7a,17f,212a" [65-90][97-122][383][8490]/Unicode,2,9),CC "CC_54,74" [84][116]/Unicode])
resolveNames:
(Seq[CC "CC_41,61" [65][97]/Unicode,Rep(CC "CC_42_46,62_66" [66-70][98-102]/Unicode,0,Unbounded),Rep(CC "CC_41_5a,61_7a,17f,212a" [65-90][97-122][383][8490]/Unicode,2,9),CC "CC_54,74" [84][116]/Unicode])
Simplifier:
(Seq[CC "CC_41,61" [65][97]/Unicode,Rep(CC "CC_42_46,62_66" [66-70][98-102]/Unicode,0,Unbounded),Rep(CC "CC_41_5a,61_7a,17f,212a" [65-90][97-122][383][8490]/Unicode,2,9),CC "CC_54,74" [84][116]/Unicode])

Pablo

Initial Pablo AST:
basis[0] = Extract basis, 0
basis[1] = Extract basis, 1
basis[2] = Extract basis, 2
basis[3] = Extract basis, 3
basis[4] = Extract basis, 4
basis[5] = Extract basis, 5
basis[6] = Extract basis, 6
basis[7] = Extract basis, 7
not = (~basis[7])
not_1 = (~basis[5])
not_2 = (~basis[3])
not_3 = (~basis[2])
not_4 = (~basis[1])
not_5 = (~basis[0])
and = (basis[6] & not)
and_1 = (basis[4] & not_1)
or = (basis[2] | basis[3])
not_6 = (~or)
or_1 = (basis[0] | basis[1])
not_7 = (~or_1)
and_2 = (and & and_1)
or_2 = (or | or_1)
not_8 = (~or_2)
LF = (and_2 & not_8)
...

LLVM IR

; ModuleID = 'FD_source16@8_O1_SSE2:B'
target triple = "x86_64-apple-darwin14.5.0"

%"FD_source16@8_O1" = type { i64, i32, { [1 x [8 x <2 x i64>]]*, i64, i64 }*, i8*, i64, { i64, i64** }*, i64, i64, i64, i64 }

; Function Attrs: nounwind
define void @"FD_source16@8_O1_Init"(%"FD_source16@8_O1"* %self, i32 %fileDescriptor, { [1 x [8 x <2 x i64>]]*, i64, i64 }* %sourceBuffer_bufferPtr, { i64, i64** }* %sourceBufferConsumerLocks) #0 {
entry:
  store %"FD_source16@8_O1" zeroinitializer, %"FD_source16@8_O1"* %self, align 8
  %0 = getelementptr %"FD_source16@8_O1", %"FD_source16@8_O1"* %self, i64 0, i32 1
  store i32 %fileDescriptor, i32* %0, align 4
  %1 = getelementptr %"FD_source16@8_O1", %"FD_source16@8_O1"* %self, i64 0, i32 2
  store { [1 x [8 x <2 x i64>]]*, i64, i64 }* %sourceBuffer_bufferPtr, { [1 x [8 x <2 x i64>]]*, i64, i64 }** %1, align 8
  %2 = getelementptr %"FD_source16@8_O1", %"FD_source16@8_O1"* %self, i64 0, i32 5
  store { i64, i64** }* %sourceBufferConsumerLocks, { i64, i64** }** %2, align 8
  %3 = icmp eq i32 %fileDescriptor, 0
  br i1 %3, label %initializeRead, label %initializeMMap

...

Intel/AMD x86_64 ASM

FD_source16@8_O1_Init:                  ## @"FD_source16@8_O1_Init"
## BB#0:                                ## %entry
	pushq	%rbp
	movq	%rsp, %rbp
	pushq	%r15
	pushq	%r14
	pushq	%rbx
	pushq	%rax
	movl	%esi, %r15d
	movq	%rdi, %rbx
	movq	$0, 72(%rbx)
	movq	$0, 64(%rbx)
	movq	$0, 56(%rbx)
	movq	$0, 48(%rbx)
	movq	$0, 40(%rbx)
	movq	$0, 32(%rbx)
	movq	$0, 24(%rbx)
	movq	$0, 16(%rbx)
	movl	$0, 8(%rbx)
	movq	$0, (%rbx)
	movl	%r15d, 8(%rbx)
	movq	%rdx, 16(%rbx)
	movq	%rcx, 40(%rbx)
	testl	%r15d, %r15d
	je	LBB0_1
## BB#3:                                ## %initializeMMap
	movabsq	$file_size, %rax
	movl	%r15d, %edi
	callq	*%rax
	movq	%rax, %r14
	movabsq	$mmap, %rax
	testq	%r14, %r14
	je	LBB0_4
## BB#5:                                ## %NonEmptyFile
	xorl	%edi, %edi
	movl	$1, %edx
	movl	$2, %ecx
	xorl	%r9d, %r9d
	movq	%r14, %rsi
	movl	%r15d, %r8d
	callq	*%rax
	movq	%r14, %r15
...
Updated Wed Jan. 03 2018, 09:30 by cameron.