Intro to icgrep
icgrep
icgrep is an SFU-developed command-line tool for regular expression search
- designed as a modern high-performance replacement for the traditional grep program
- supports:
- traditional Posix regular expressions (grep),
- Perl-compatible regular expressions (PCRE),
- full Unicode regular expressions of UTS #18: Unicode Regular Expressions
- implemented using parallel techniques
- bitwise data parallel regular expression matching
- SIMD parallelism
- multithread parallelism
- designed for outstanding performance with complicated regular expressions and big data applications
The Symbolic Data Domains
- RE: regular expressions
- Pablo: tiny programming language for arbitrary-length bit streams
- LLVM IR: portable intermediate representation/high-level assembly language
- ASM: assembler language of the target machine.
When using icgrep, you can always display the symbolic output of the various phases using the command flags
- -ShowREs
- -ShowPablo
- -ShowIR
- -ShowASM
The Phases of icgrep
- RE phases
- parse REs into internal AST (abstract syntax tree form)
- perform RE to RE transformations, for example
- Case-insensitive transformation
- Star Normal Form
- compile REs to Pablo using the Parabix regular expression algorithm
- Pablo phases
- perform Pablo to Pablo transformations, for example
- common subexpression elimination
- dead-code elimination
- sinking
- compile Pablo into LLVM IR
- perform Pablo to Pablo transformations, for example
- LLVM IR passes
- IR to IR transformations
- compile LLVM IR to target machine architecture
- ASM phases
- ASM to ASM transformations, such as peephole optimization
- generate final binary machine code
Example
./icgrep '(?i)a(?:[bc]*[d-f]*)*[a-z]{2,9}t' -ShowAllREs -ShowPablo -ShowIR -ShowASM
RE Passes
Parser: Group(+i:(Seq[CC "Unicode_61" ,Rep((Seq[Rep(CC "Unicode_62_63" ,0,Unbounded),Rep(CC "Unicode_64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_61_7a" ,2,9),CC "Unicode_74" ])) UnneededCaptureRemoval: Group(+i:(Seq[CC "Unicode_61" ,Rep((Seq[Rep(CC "Unicode_62_63" ,0,Unbounded),Rep(CC "Unicode_64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_61_7a" ,2,9),CC "Unicode_74" ])) EscapeNames: Group(+i:(Seq[CC "Unicode_61" ,Rep((Seq[Rep(CC "Unicode_62_63" ,0,Unbounded),Rep(CC "Unicode_64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_61_7a" ,2,9),CC "Unicode_74" ])) ResolveGraphemeMode: Group(+i:(Seq[CC "Unicode_61" ,Rep((Seq[Rep(CC "Unicode_62_63" ,0,Unbounded),Rep(CC "Unicode_64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_61_7a" ,2,9),CC "Unicode_74" ])) PropertyLinker: Group(+i:(Seq[CC "Unicode_61" ,Rep((Seq[Rep(CC "Unicode_62_63" ,0,Unbounded),Rep(CC "Unicode_64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_61_7a" ,2,9),CC "Unicode_74" ])) PropertyStandardization: Group(+i:(Seq[CC "Unicode_61" ,Rep((Seq[Rep(CC "Unicode_62_63" ,0,Unbounded),Rep(CC "Unicode_64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_61_7a" ,2,9),CC "Unicode_74" ])) PropertyResolver: Group(+i:(Seq[CC "Unicode_61" ,Rep((Seq[Rep(CC "Unicode_62_63" ,0,Unbounded),Rep(CC "Unicode_64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_61_7a" ,2,9),CC "Unicode_74" ])) SimplePropertyInliner: Group(+i:(Seq[CC "Unicode_61" ,Rep((Seq[Rep(CC "Unicode_62_63" ,0,Unbounded),Rep(CC "Unicode_64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_61_7a" ,2,9),CC "Unicode_74" ])) PropertyExternalizer: Group(+i:(Seq[CC "Unicode_61" ,Rep((Seq[Rep(CC "Unicode_62_63" ,0,Unbounded),Rep(CC "Unicode_64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_61_7a" ,2,9),CC "Unicode_74" ])) NamesDefinedValidator success: Group(+i:(Seq[CC "Unicode_61" ,Rep((Seq[Rep(CC "Unicode_62_63" ,0,Unbounded),Rep(CC "Unicode_64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_61_7a" ,2,9),CC "Unicode_74" ])) CaseInsensitize: (Seq[CC "Unicode_41,61" ,Rep((Seq[Rep(CC "Unicode_42_43,62_63" ,0,Unbounded),Rep(CC "Unicode_44_46,64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_41_5a,61_7a,17f,212a" ,2,9),CC "Unicode_54,74" ]) ContextualAssertionSimplifier: (Seq[CC "Unicode_41,61" ,Rep((Seq[Rep(CC "Unicode_42_43,62_63" ,0,Unbounded),Rep(CC "Unicode_44_46,64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_41_5a,61_7a,17f,212a" ,2,9),CC "Unicode_54,74" ]) Exclude: (Seq[CC "Unicode_41,61" ,Rep((Seq[Rep(CC "Unicode_42_43,62_63" ,0,Unbounded),Rep(CC "Unicode_44_46,64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_41_5a,61_7a,17f,212a" ,2,9),CC "Unicode_54,74" ]) Anchor Resolution: (Seq[CC "Unicode_41,61" ,Rep((Seq[Rep(CC "Unicode_42_43,62_63" ,0,Unbounded),Rep(CC "Unicode_44_46,64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_41_5a,61_7a,17f,212a" ,2,9),CC "Unicode_54,74" ]) NullablePrefixRemoval: (Seq[CC "Unicode_41,61" ,Rep((Seq[Rep(CC "Unicode_42_43,62_63" ,0,Unbounded),Rep(CC "Unicode_44_46,64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_41_5a,61_7a,17f,212a" ,2,9),CC "Unicode_54,74" ]) NullableSuffixRemoval: (Seq[CC "Unicode_41,61" ,Rep((Seq[Rep(CC "Unicode_42_43,62_63" ,0,Unbounded),Rep(CC "Unicode_44_46,64_66" ,0,Unbounded)]),0,Unbounded),Rep(CC "Unicode_41_5a,61_7a,17f,212a" ,2,9),CC "Unicode_54,74" ]) StarNormal: (Seq[CC "Unicode_41,61" ,Rep(CC "Unicode_42_46,62_66" ,0,Unbounded),Rep(CC "Unicode_41_5a,61_7a,17f,212a" ,2,9),CC "Unicode_54,74" ]) Simplifier: (Seq[CC "Unicode_41,61" ,Rep(CC "Unicode_42_46,62_66" ,0,Unbounded),Rep(CC "Unicode_41_5a,61_7a,17f,212a" ,2,9),CC "Unicode_54,74" ]) DiffResolver: (Seq[CC "Unicode_41,61" ,Rep(CC "Unicode_42_46,62_66" ,0,Unbounded),Rep(CC "Unicode_41_5a,61_7a,17f,212a" ,2,9),CC "Unicode_54,74" ]) Anchor Resolution: (Seq[CC "Unicode_41,61" ,Rep(CC "Unicode_42_46,62_66" ,0,Unbounded),Rep(CC "Unicode_41_5a,61_7a,17f,212a" ,2,9),CC "Unicode_54,74" ]) FixedUTF8Validator failure: (Seq[CC "Unicode_41,61" ,Rep(CC "Unicode_42_46,62_66" ,0,Unbounded),Rep(CC "Unicode_41_5a,61_7a,17f,212a" ,2,9),CC "Unicode_54,74" ]) Multiplex_mpx: (Seq[CC "mpx_1" ,Rep(CC "mpx_2" ,0,Unbounded),Rep(CC "mpx_1_4" ,2,9),CC "mpx_4" ])
Pablo
### Initial Pablo AST ### ... kernel icc97ca8517af7a59334aaa19f2bfff28095b42ae9 :: [<i1>[3] Unicode_basis, <i1>[3] mpx_basis] -> [<i1>[1] matches] { final_matches = <0> not = ~mpx_basis[1] not_1 = ~mpx_basis[2] and = mpx_basis[0] & not and_1 = not_1 & and mpx_1 = InFile(and_1) advance = Advance(mpx_1, 1) not_2 = ~mpx_basis[0] and_2 = mpx_basis[1] & not_2 and_3 = not_1 & and_2 mpx_2 = InFile(and_3) unbounded = MatchStar(advance, mpx_2) or = mpx_basis[0] | mpx_basis[1] not_3 = ~or xor = mpx_basis[2] ^ or mpx_1_4 = InFile(xor) advance_1 = Advance(mpx_1_4, 1) at2of2 = mpx_1_4 & advance_1 marker_fwd = Advance(unbounded, 1) lowerbound = at2of2 & marker_fwd advance_2 = Advance(lowerbound, 1) within1 = lowerbound | advance_2 advance_3 = Advance(within1, 1) within2 = within1 | advance_3 advance_4 = Advance(within2, 2) within4 = within2 | advance_4 advance_5 = Advance(within4, 3) within7 = within4 | advance_5 masked = mpx_1_4 & within7 matchstar = MatchStar(lowerbound, masked) bounded = within7 & matchstar m = <0> if bounded { advance_6 = Advance(bounded, 1) and_4 = mpx_basis[2] & not_3 mpx_4 = InFile(and_4) and_5 = advance_6 & mpx_4 m = and_5 } advance_7 = Advance(m, 1) final_matches = advance_7 matches[0] = final_matches } ...
LLVM IR
; ModuleID = 'FD_source16384@8_AVX2' source_filename = "FD_source16384@8_AVX2" target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-apple-darwin19.6.0" %"FD_source16384@8_shared_state" = type { i64, i32, i64, i8*, i8*, i64, { [1 x [8 x <4 x i64>]]*, i64 } } ; Function Attrs: norecurse define i64 @"FD_source16384@8_Initialize"(%"FD_source16384@8_shared_state"* %shared, i64 %useMMap, i32 %fileDescriptor) #0 { entry: %0 = getelementptr inbounds %"FD_source16384@8_shared_state", %"FD_source16384@8_shared_state"* %shared, i32 0, i32 0 %1 = getelementptr inbounds %"FD_source16384@8_shared_state", %"FD_source16384@8_shared_state"* %shared, i32 0, i32 1 %2 = getelementptr inbounds %"FD_source16384@8_shared_state", %"FD_source16384@8_shared_state"* %shared, i32 0, i32 2 %3 = getelementptr inbounds %"FD_source16384@8_shared_state", %"FD_source16384@8_shared_state"* %shared, i32 0, i32 3 %4 = getelementptr inbounds %"FD_source16384@8_shared_state", %"FD_source16384@8_shared_state"* %shared, i32 0, i32 4 %5 = getelementptr inbounds %"FD_source16384@8_shared_state", %"FD_source16384@8_shared_state"* %shared, i32 0, i32 5 %6 = getelementptr inbounds %"FD_source16384@8_shared_state", %"FD_source16384@8_shared_state"* %shared, i32 0, i32 6 store i64 %useMMap, i64* %0 store i32 %fileDescriptor, i32* %1 %7 = load i64, i64* %0 %8 = load i32, i32* %1 %9 = icmp ne i32 %8, 0 %10 = icmp ne i64 %7, 0 %11 = and i1 %10, %9 br i1 %11, label %checkFileSize, label %initializeRead ...
Intel/AMD x86_64 ASM
_FD_source16384@8_Initialize: ## @"FD_source16384@8_Initialize" .cfi_startproc ## BB#0: ## %entry pushq %rbp Lcfi0: .cfi_def_cfa_offset 16 pushq %rbx Lcfi1: .cfi_def_cfa_offset 24 subq $232, %rsp Lcfi2: .cfi_def_cfa_offset 256 Lcfi3: .cfi_offset %rbx, -24 Lcfi4: .cfi_offset %rbp, -16 movq %rdi, %rax addq $8, %rax movq %rdi, %rcx addq $16, %rcx movq %rdi, %r8 addq $24, %r8 movq %rdi, %r9 addq $32, %r9 movq %rdi, %r10 addq $40, %r10 movq %rdi, %r11 addq $48, %r11 movq %rsi, (%rdi) movl %edx, 8(%rdi) cmpl $0, %edx setne %bl cmpq $0, %rsi setne %bpl andb %bl, %bpl testb $1, %bpl movq %r11, 224(%rsp) ## 8-byte Spill movq %rdi, 216(%rsp) ## 8-byte Spill movl %edx, 212(%rsp) ## 4-byte Spill movq %rax, 200(%rsp) ## 8-byte Spill movq %rcx, 192(%rsp) ## 8-byte Spill movq %r8, 184(%rsp) ## 8-byte Spill movq %r9, 176(%rsp) ## 8-byte Spill movq %r10, 168(%rsp) ## 8-byte Spill jne LBB0_2 ...
Updated Fri May 14 2021, 12:20 by cameron.