diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..4039bef --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,16 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**" + ], + "defines": [], + "compilerPath": "/usr/bin/gcc", + "cStandard": "c17", + "cppStandard": "gnu++17", + "intelliSenseMode": "linux-gcc-x64" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..90a6474 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,12 @@ +{ + "C_Cpp.errorSquiggles": "disabled", + "files.associations": { + "limits": "cpp", + "iostream": "cpp", + "cmath": "cpp", + "optional": "cpp", + "new": "cpp", + "ostream": "cpp", + "numbers": "cpp" + } +} \ No newline at end of file diff --git a/aot/idw.s b/aot/idw.s index 647cee3..d8d564c 100644 --- a/aot/idw.s +++ b/aot/idw.s @@ -1,450 +1,362 @@ - .text - .attribute 4, 16 - .attribute 5, "rv64gcv0p7" - .file "halide_buffer_t.cpp" - .section .text.idw,"ax",@progbits - .globl idw # -- Begin function idw - .p2align 1 - .type idw,@function -idw: # @idw -# %bb.0: # %entry - addi sp, sp, -192 - sd ra, 184(sp) # 8-byte Folded Spill - sd s0, 176(sp) # 8-byte Folded Spill - sd s1, 168(sp) # 8-byte Folded Spill - sd s2, 160(sp) # 8-byte Folded Spill - sd s3, 152(sp) # 8-byte Folded Spill - sd s4, 144(sp) # 8-byte Folded Spill - sd s5, 136(sp) # 8-byte Folded Spill - sd s6, 128(sp) # 8-byte Folded Spill - sd s7, 120(sp) # 8-byte Folded Spill - sd s8, 112(sp) # 8-byte Folded Spill - sd s9, 104(sp) # 8-byte Folded Spill - sd s10, 96(sp) # 8-byte Folded Spill - sd s11, 88(sp) # 8-byte Folded Spill -.Lpcrel_hi0: - auipc a1, %pcrel_hi(.Lb2.buffer) - addi a1, a1, %pcrel_lo(.Lpcrel_hi0) - ld s6, 16(a1) - ld a3, 40(a0) -.Lpcrel_hi1: - auipc a2, %pcrel_hi(.Lb3.buffer) - addi a2, a2, %pcrel_lo(.Lpcrel_hi1) - ld s9, 16(a2) - lw s4, 0(a3) - lwu s8, 4(a3) - lw a4, 16(a3) - sd a4, 8(sp) # 8-byte Folded Spill - ld a4, 0(a1) - lwu a5, 20(a3) - sd a5, 32(sp) # 8-byte Folded Spill - lw s1, 24(a3) - or a5, s6, a4 - mv a4, s9 - bnez a5, .LBB0_2 -# %bb.1: # %then_bb - lui a4, 128 - addiw a4, a4, 9 - slli a4, a4, 13 - sd a4, 32(a1) - ld a4, 40(a1) - sd zero, 0(a1) - sd zero, 8(a1) - sd zero, 16(a1) - sw zero, 0(a4) - li a5, 300 - sw a5, 4(a4) - li a5, 1 - sw a5, 8(a4) - sw zero, 12(a4) - ld a4, 16(a2) - sd zero, 24(a1) -.LBB0_2: # %after_bb - ld a5, 0(a2) - ld s11, 16(a0) - or a4, a4, a5 - bnez a4, .LBB0_4 -# %bb.3: # %then_bb2 - sd zero, 16(a2) - sd zero, 8(a2) - sd zero, 0(a2) - lui a4, 128 - addiw a4, a4, 9 - ld a5, 40(a2) - slli a4, a4, 13 - addi a4, a4, 2 - sd a4, 32(a2) - sw zero, 0(a5) - li a4, 100 - sw a4, 4(a5) - li a4, 1 - sw a4, 8(a5) - sw zero, 12(a5) - sd zero, 24(a2) -.LBB0_4: # %after_bb1 - sext.w a4, s8 - sd a4, 24(sp) # 8-byte Folded Spill - lw a4, 32(sp) # 8-byte Folded Reload - sd s1, 16(sp) # 8-byte Folded Spill - beqz s11, .LBB0_6 -# %bb.5: - li a0, 0 - j .LBB0_9 -.LBB0_6: # %_halide_buffer_is_bounds_query.exit36 - ld a5, 0(a0) - bnez a5, .LBB0_8 -# %bb.7: # %then_bb5 - sd zero, 16(a0) - sd zero, 8(a0) - sd zero, 0(a0) - lui a5, 256 - addiw a5, a5, 9 - slli a5, a5, 13 - addi a5, a5, 2 - sd a5, 32(a0) - sw s4, 0(a3) - ld s1, 24(sp) # 8-byte Folded Reload - sw s1, 4(a3) - li a5, 1 - sw a5, 8(a3) - sw zero, 12(a3) - ld a5, 8(sp) # 8-byte Folded Reload - sw a5, 16(a3) - sw a4, 20(a3) - sw s1, 24(a3) - sw zero, 28(a3) - sd zero, 24(a0) -.LBB0_8: # %land.rhs.i49 - ld a0, 0(a0) - seqz a0, a0 -.LBB0_9: # %_halide_buffer_is_bounds_query.exit50 - ld a3, 16(a1) - ld a1, 0(a1) - ld a5, 16(a2) - ld a2, 0(a2) - or a1, a1, a3 - seqz a1, a1 - or a2, a2, a5 - seqz a2, a2 - or a1, a1, a2 - or a0, a0, a1 - slti a1, a4, 1 - or a0, a0, a1 - bnez a0, .LBB0_20 -# %bb.10: # %"for idw.s0.y.rebased.preheader" - ld s3, 16(sp) # 8-byte Folded Reload - ld a0, 24(sp) # 8-byte Folded Reload - blez a0, .LBB0_13 -# %bb.11: # %"for idw.s0.y.rebased.us.preheader" - li s0, 0 - ld s2, 24(sp) # 8-byte Folded Reload - slli s2, s2, 2 - ld s1, 32(sp) # 8-byte Folded Reload -.LBB0_12: # %"for idw.s0.y.rebased.us" - # =>This Inner Loop Header: Depth=1 - slli a0, s0, 2 - add a0, a0, s11 - li a1, 0 - mv a2, s2 - call memset@plt - addi s1, s1, -1 - addw s0, s0, s3 - bnez s1, .LBB0_12 -.LBB0_13: # %"for idw.s1.y.rebased.preheader" - sd zero, 40(sp) # 8-byte Folded Spill - li a1, 6 - li a2, 1 - addi a3, sp, 64 - addi a7, sp, 76 - addi t0, sp, 72 - addi t1, sp, 68 - li s0, 4 - addi s1, sp, 48 - addi t2, sp, 52 - addi t3, sp, 60 - addi t6, sp, 56 - li ra, 2 - li a6, 3 -.LBB0_14: # %"for idw.s1.y.rebased" - # =>This Loop Header: Depth=1 - # Child Loop BB0_16 Depth 2 - # Child Loop BB0_17 Depth 3 - ld a0, 24(sp) # 8-byte Folded Reload - blez a0, .LBB0_19 -# %bb.15: # %"for idw.s1.x.rebased.preheader" - # in Loop: Header=BB0_14 Depth=1 - li t4, 0 - ld a0, 16(sp) # 8-byte Folded Reload - ld a4, 40(sp) # 8-byte Folded Reload - mulw t5, a0, a4 - ld a0, 8(sp) # 8-byte Folded Reload - addw a0, a0, a4 -.LBB0_16: # %"for idw.s1.x.rebased" - # Parent Loop BB0_14 Depth=1 - # => This Loop Header: Depth=2 - # Child Loop BB0_17 Depth 3 - add a4, t4, t5 - slli a4, a4, 2 - add s2, s11, a4 - flw ft0, 0(s2) - addw a4, s4, t4 - li s5, 25 - mv s3, s6 - mv s10, s9 -.LBB0_17: # %"for idw.s1.r4$x.r4$x" - # Parent Loop BB0_14 Depth=1 - # Parent Loop BB0_16 Depth=2 - # => This Inner Loop Header: Depth=3 - addi s7, s3, 4 - vsetvli zero, a1, e32, m2 - vlwu.v v8, (s7) - vsetvli zero, a2, e32, m2 - vslidedown.vi v10, v8, 3 - addi a5, s3, 20 - vsetvli zero, a1, e32, m2 - vlwu.v v12, (a5) - vsetvli zero, a2, e32, m2 - vslidedown.vi v14, v12, 2 - vslidedown.vi v12, v12, 5 - vsw.v v8, (a3) - vsw.v v12, (a7) - vsw.v v14, (t0) - vsw.v v10, (t1) - vsetvli zero, s0, e32, m1 - vlwu.v v8, (a3) - vsetvli a5, zero, e32, m1 - vrsub.vx v8, v8, a4 - vsetvli zero, a1, e32, m2 - vlwu.v v10, (s3) - vsetvli zero, a2, e32, m2 - vslidedown.vi v12, v10, 3 - addi a5, s3, 16 - vsetvli zero, a1, e32, m2 - vlwu.v v14, (a5) - vsetvli zero, a2, e32, m2 - vslidedown.vi v16, v14, 2 - vslidedown.vi v14, v14, 5 - vsw.v v10, (s1) - vsw.v v12, (t2) - vsw.v v14, (t3) - vsw.v v16, (t6) - vsetvli zero, s0, e32, m1 - vlwu.v v9, (s1) - vsetvli a5, zero, e32, m1 - vrsub.vx v9, v9, a0 - vmul.vv v9, v9, v9 - vmacc.vv v9, v8, v8 - vfcvt.f.x.v v8, v9 - vfmv.f.s ft1, v8 - fsqrt.s ft1, ft1 - vfmv.s.f v9, ft1 - vsetvli zero, a2, e32, m1 - vslidedown.vi v10, v8, 1 - vfmv.f.s ft1, v10 - fsqrt.s ft1, ft1 - vsetvli a5, zero, e32, m1 - vfmv.s.f v10, ft1 - vsetvli zero, ra, e32, m1 - vslideup.vi v9, v10, 1 - vsetvli zero, a2, e32, m1 - vslidedown.vi v10, v8, 2 - vfmv.f.s ft1, v10 - fsqrt.s ft1, ft1 - vsetvli a5, zero, e32, m1 - vfmv.s.f v10, ft1 - vsetvli zero, a6, e32, m1 - vslideup.vi v9, v10, 2 - vsetvli zero, a2, e32, m1 - vslidedown.vi v8, v8, 3 - vfmv.f.s ft1, v8 - fsqrt.s ft1, ft1 - vsetvli a5, zero, e32, m1 - vfmv.s.f v8, ft1 - vsetvli zero, s0, e32, m1 - vslideup.vi v9, v8, 3 - vsetvli zero, zero, e32, m1 - vlwu.v v8, (s10) - vfmul.vv v8, v9, v8 - vfmv.s.f v9, ft0 - vfredsum.vs v8, v8, v9 - vfmv.f.s ft0, v8 - addi s5, s5, -1 - addi s10, s10, 16 - addi s3, s3, 48 - bnez s5, .LBB0_17 -# %bb.18: # %"end for idw.s1.r4$x.r4$x" - # in Loop: Header=BB0_16 Depth=2 - addi t4, t4, 1 - fsw ft0, 0(s2) - bne t4, s8, .LBB0_16 -.LBB0_19: # %"end for idw.s1.x.rebased" - # in Loop: Header=BB0_14 Depth=1 - ld a4, 40(sp) # 8-byte Folded Reload - addi a4, a4, 1 - ld a0, 32(sp) # 8-byte Folded Reload - sd a4, 40(sp) # 8-byte Folded Spill - bne a4, a0, .LBB0_14 -.LBB0_20: # %destructor_block - li a0, 0 - ld ra, 184(sp) # 8-byte Folded Reload - ld s0, 176(sp) # 8-byte Folded Reload - ld s1, 168(sp) # 8-byte Folded Reload - ld s2, 160(sp) # 8-byte Folded Reload - ld s3, 152(sp) # 8-byte Folded Reload - ld s4, 144(sp) # 8-byte Folded Reload - ld s5, 136(sp) # 8-byte Folded Reload - ld s6, 128(sp) # 8-byte Folded Reload - ld s7, 120(sp) # 8-byte Folded Reload - ld s8, 112(sp) # 8-byte Folded Reload - ld s9, 104(sp) # 8-byte Folded Reload - ld s10, 96(sp) # 8-byte Folded Reload - ld s11, 88(sp) # 8-byte Folded Reload - addi sp, sp, 192 - ret -.Lfunc_end0: - .size idw, .Lfunc_end0-idw - # -- End function - .section .text.idw_argv,"ax",@progbits - .globl idw_argv # -- Begin function idw_argv - .p2align 1 - .type idw_argv,@function -idw_argv: # @idw_argv -# %bb.0: # %entry - addi sp, sp, -16 - sd ra, 8(sp) # 8-byte Folded Spill - ld a0, 0(a0) - call idw@plt - li a0, 0 - ld ra, 8(sp) # 8-byte Folded Reload - addi sp, sp, 16 - ret -.Lfunc_end1: - .size idw_argv, .Lfunc_end1-idw_argv - # -- End function - .section .text.idw_metadata,"ax",@progbits - .globl idw_metadata # -- Begin function idw_metadata - .p2align 1 - .type idw_metadata,@function -idw_metadata: # @idw_metadata -# %bb.0: # %entry -.Lpcrel_hi2: - auipc a0, %pcrel_hi(.Lidw_metadata_storage) - addi a0, a0, %pcrel_lo(.Lpcrel_hi2) - ret -.Lfunc_end2: - .size idw_metadata, .Lfunc_end2-idw_metadata - # -- End function - .type .Lb2.shape,@object # @b2.shape - .section .rodata,"a",@progbits - .p2align 5, 0x0 -.Lb2.shape: - .asciz "\000\000\000\000,\001\000\000\001\000\000\000\000\000\000" - .size .Lb2.shape, 16 - - .type .Lb2.data,@object # @b2.data - .p2align 5, 0x0 -.Lb2.data: - .asciz "\000\000\000\000\000\000\000\000H\000\000\000\000\000\000\000\325\000\000\000O\000\000\000\000\000\000\000\252\001\000\000<\000\000\000\000\000\000\000\200\002\000\000L\000\000\000\000\000\000\000U\003\000\000\200\000\000\000\000\000\000\000*\004\000\000C\000\000\000\000\000\000\000\000\005\000\000A\000\000\000\000\000\000\000\325\005\000\000@\000\000\000\000\000\000\000\252\006\000\000<\000\000\000\000\000\000\000\200\007\000\000=\000\000\000x\000\000\000\000\000\000\000Q\000\000\000x\000\000\000\325\000\000\000O\000\000\000x\000\000\000\252\001\000\000:\000\000\000x\000\000\000\200\002\000\000\204\000\000\000x\000\000\000U\003\000\000\225\000\000\000x\000\000\000*\004\000\000\216\000\000\000x\000\000\000\000\005\000\000@\000\000\000x\000\000\000\325\005\000\000E\000\000\000x\000\000\000\252\006\000\000A\000\000\000x\000\000\000\200\007\000\000@\000\000\000\360\000\000\000\000\000\000\000K\000\000\000\360\000\000\000\325\000\000\000D\000\000\000\360\000\000\000\252\001\000\000\214\000\000\000\360\000\000\000\200\002\000\000\231\000\000\000\360\000\000\000U\003\000\000\221\000\000\000\360\000\000\000*\004\000\000\204\000\000\000\360\000\000\000\000\005\000\000\230\000\000\000\360\000\000\000\325\005\000\000}\000\000\000\360\000\000\000\252\006\000\000B\000\000\000\360\000\000\000\200\007\000\000:\000\000\000h\001\000\000\000\000\000\000N\000\000\000h\001\000\000\325\000\000\000<\000\000\000h\001\000\000\252\001\000\000\213\000\000\000h\001\000\000\200\002\000\000\250\000\000\000h\001\000\000U\003\000\000\232\000\000\000h\001\000\000*\004\000\000\212\000\000\000h\001\000\000\000\005\000\000\221\000\000\000h\001\000\000\325\005\000\000\240\000\000\000h\001\000\000\252\006\000\000D\000\000\000h\001\000\000\200\007\000\000<\000\000\000\340\001\000\000\000\000\000\000M\000\000\000\340\001\000\000\325\000\000\000;\000\000\000\340\001\000\000\252\001\000\000\245\000\000\000\340\001\000\000\200\002\000\000\267\000\000\000\340\001\000\000U\003\000\000\246\000\000\000\340\001\000\000*\004\000\000\216\000\000\000\340\001\000\000\000\005\000\000{\000\000\000\340\001\000\000\325\005\000\000\233\000\000\000\340\001\000\000\252\006\000\000\220\000\000\000\340\001\000\000\200\007\000\000<\000\000\000X\002\000\000\000\000\000\000S\000\000\000X\002\000\000\325\000\000\000A\000\000\000X\002\000\000\252\001\000\000\262\000\000\000X\002\000\000\200\002\000\000\270\000\000\000X\002\000\000U\003\000\000\212\000\000\000X\002\000\000*\004\000\000|\000\000\000X\002\000\000\000\005\000\000\204\000\000\000X\002\000\000\325\005\000\000\257\000\000\000X\002\000\000\252\006\000\000\257\000\000\000X\002\000\000\200\007\000\000<\000\000\000\320\002\000\000\000\000\000\000V\000\000\000\320\002\000\000\325\000\000\000<\000\000\000\320\002\000\000\252\001\000\000\266\000\000\000\320\002\000\000\200\002\000\000\263\000\000\000\320\002\000\000U\003\000\000\230\000\000\000\320\002\000\000*\004\000\000\214\000\000\000\320\002\000\000\000\005\000\000s\000\000\000\320\002\000\000\325\005\000\000\234\000\000\000\320\002\000\000\252\006\000\000\254\000\000\000\320\002\000\000\200\007\000\000A\000\000\000H\003\000\000\000\000\000\000Z\000\000\000H\003\000\000\325\000\000\000E\000\000\000H\003\000\000\252\001\000\000\244\000\000\000H\003\000\000\200\002\000\000\272\000\000\000H\003\000\000U\003\000\000\222\000\000\000H\003\000\000*\004\000\000\223\000\000\000H\003\000\000\000\005\000\000z\000\000\000H\003\000\000\325\005\000\000\230\000\000\000H\003\000\000\252\006\000\000\246\000\000\000H\003\000\000\200\007\000\000E\000\000\000\300\003\000\000\000\000\000\000U\000\000\000\300\003\000\000\325\000\000\000C\000\000\000\300\003\000\000\252\001\000\000\272\000\000\000\300\003\000\000\200\002\000\000\251\000\000\000\300\003\000\000U\003\000\000\242\000\000\000\300\003\000\000*\004\000\000\223\000\000\000\300\003\000\000\000\005\000\000|\000\000\000\300\003\000\000\325\005\000\000\225\000\000\000\300\003\000\000\252\006\000\000\235\000\000\000\300\003\000\000\200\007\000\000E\000\000\0008\004\000\000\000\000\000\000U\000\000\0008\004\000\000\325\000\000\000F\000\000\0008\004\000\000\252\001\000\000\301\000\000\0008\004\000\000\200\002\000\000\300\000\000\0008\004\000\000U\003\000\000\233\000\000\0008\004\000\000*\004\000\000\207\000\000\0008\004\000\000\000\005\000\000x\000\000\0008\004\000\000\325\005\000\000\212\000\000\0008\004\000\000\252\006\000\000\200\000\000\0008\004\000\000\200\007\000\000F\000\000" - .size .Lb2.data, 1200 - - .type .Lb2.buffer,@object # @b2.buffer - .data - .p2align 4, 0x0 -.Lb2.buffer: - .quad 0 # 0x0 - .quad 0 - .quad .Lb2.data - .quad 1 # 0x1 - .byte 0 # 0x0 - .byte 32 # 0x20 - .half 1 # 0x1 - .word 1 # 0x1 - .quad .Lb2.shape - .quad 0 - .size .Lb2.buffer, 56 - - .type .Lb3.shape,@object # @b3.shape - .section .rodata,"a",@progbits - .p2align 5, 0x0 -.Lb3.shape: - .asciz "\000\000\000\000d\000\000\000\001\000\000\000\000\000\000" - .size .Lb3.shape, 16 - - .type .Lb3.data,@object # @b3.data - .p2align 5, 0x0 -.Lb3.data: - .ascii "\211\004&=\244\022\233\275\217U%;\272)J>#\026\024\276@_\234>\340\243|\275\334\034K\275\232\300\004:A\313\245\274\303\024r\275\345\313a\275x\231\300>\201\334\"\276\236}\321\275H\346\257\276\202V\310>\225\375(>},\026\274E\235W\275\241\236\236<\245\220\222=\004\361\263\276\2620\240\274\265\036\240=\\:\336=\240\016\263\276\260\224\371\275p\235(=\311\\x;\226y\\\275\360\227\377=\275\364\200=rw/<@U\017=c\327\003=nV\267\273\3543\207\311\274<&{\274h\b\031>\203\222\003\276\213\3649\275d\177\017\276\330\306\246\275'\314:>\203\024\253=S\331M\276\257\263S=b\221q\275\336\t\306=#^\005\2763)\256\274\344\255G>\224!\374=\374\360o\275\215#\031\276\322`J\276y6\360=\273\303;\275N\"N>M\022K\276\016\035C=1Fb\275u\017\220\275\0040\033>,\037\223=t\335\304\275\363\344\271=\320\341\260\275\2265\255=\310\"b=&\031\330\275;\314\000>>\347o\275\255\222\020=C\rU<\370d\307\275\373\306\206=I\345\373\274Y|\034>WY;\276O\240?>B\347\226\275\203\271\203\275W\243\313<\214\364\343\274Ey\033\276J4\214=\261\304\241\274-\271p>X\313l\276*_\366\275\260^\\=\237l1=\371\272\200=\002\373?\274*\3578<]Gv=" - .size .Lb3.data, 400 - - .type .Lb3.buffer,@object # @b3.buffer - .data - .p2align 4, 0x0 -.Lb3.buffer: - .quad 0 # 0x0 - .quad 0 - .quad .Lb3.data - .quad 1 # 0x1 - .byte 2 # 0x2 - .byte 32 # 0x20 - .half 1 # 0x1 - .word 1 # 0x1 - .quad .Lb3.shape - .quad 0 - .size .Lb3.buffer, 56 - - .type .L__unnamed_1,@object # @0 - .section .rodata,"a",@progbits - .p2align 4, 0x0 -.L__unnamed_1: - .zero 32 - .size .L__unnamed_1, 32 - - .type .Lstr,@object # @str - .p2align 5, 0x0 -.Lstr: - .asciz "idw" - .size .Lstr, 4 - - .type .L__unnamed_2,@object # @1 - .section .data.rel.ro,"aw",@progbits - .p2align 4, 0x0 -.L__unnamed_2: - .quad .Lstr - .word 2 # 0x2 - .word 2 # 0x2 - .byte 2 # 0x2 - .byte 32 # 0x20 - .half 1 # 0x1 - .zero 4 - .quad 0 - .quad 0 - .quad 0 - .quad 0 - .quad .L__unnamed_1 - .size .L__unnamed_2, 64 - - .type .Lstr.4,@object # @str.4 - .section .rodata,"a",@progbits - .p2align 5, 0x0 -.Lstr.4: - .asciz "riscv-64-linux-no_asserts-no_runtime-rvv-vector_bits_128" - .size .Lstr.4, 57 - - .type .Lidw_metadata_storage,@object # @idw_metadata_storage - .section .data.rel.ro,"aw",@progbits - .p2align 4, 0x0 -.Lidw_metadata_storage: - .word 1 # 0x1 - .word 1 # 0x1 - .quad .L__unnamed_2 - .quad .Lstr.4 - .quad .Lstr - .size .Lidw_metadata_storage, 32 - - .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" - .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" - .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" - .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" - .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" - .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" - .section ".note.GNU-stack","",@progbits + .text + .attribute 4, 16 + .attribute 5, "rv64gcv0p7" + .file "halide_buffer_t.cpp" + .section .text.idw,"ax",@progbits + .globl idw # -- Begin function idw + .p2align 1 + .type idw,@function +idw: # @idw +# %bb.0: # %entry + addi sp, sp, -96 + sd ra, 88(sp) # 8-byte Folded Spill + sd s0, 80(sp) # 8-byte Folded Spill + sd s1, 72(sp) # 8-byte Folded Spill + sd s2, 64(sp) # 8-byte Folded Spill + sd s3, 56(sp) # 8-byte Folded Spill + sd s4, 48(sp) # 8-byte Folded Spill + sd s5, 40(sp) # 8-byte Folded Spill + sd s6, 32(sp) # 8-byte Folded Spill + sd s7, 24(sp) # 8-byte Folded Spill + sd s8, 16(sp) # 8-byte Folded Spill + sd s9, 8(sp) # 8-byte Folded Spill + sd s10, 0(sp) # 8-byte Folded Spill +.Lpcrel_hi0: + auipc a1, %pcrel_hi(.Lb2.buffer) + addi a1, a1, %pcrel_lo(.Lpcrel_hi0) + ld s8, 16(a1) + ld a3, 40(a0) +.Lpcrel_hi1: + auipc a2, %pcrel_hi(.Lb3.buffer) + addi a2, a2, %pcrel_lo(.Lpcrel_hi1) + ld s9, 16(a2) + lw s3, 0(a3) + lw s7, 4(a3) + lw s4, 16(a3) + ld a4, 0(a1) + lwu s5, 20(a3) + lw s6, 24(a3) + or a5, s8, a4 + mv a4, s9 + beqz a5, .LBB0_4 +# %bb.1: # %after_bb + ld a5, 0(a2) + ld s10, 16(a0) + or a4, a4, a5 + beqz a4, .LBB0_5 +.LBB0_2: # %after_bb1 + sext.w a4, s5 + beqz s10, .LBB0_6 +.LBB0_3: + li a0, 0 + j .LBB0_9 +.LBB0_4: # %then_bb + lui a4, 128 + addiw a4, a4, 9 + slli a4, a4, 13 + sd a4, 32(a1) + ld a4, 40(a1) + sd zero, 0(a1) + sd zero, 8(a1) + sd zero, 16(a1) + sw zero, 0(a4) + li a5, 300 + sw a5, 4(a4) + li a5, 1 + sw a5, 8(a4) + sw zero, 12(a4) + ld a4, 16(a2) + sd zero, 24(a1) + ld a5, 0(a2) + ld s10, 16(a0) + or a4, a4, a5 + bnez a4, .LBB0_2 +.LBB0_5: # %then_bb2 + sd zero, 16(a2) + sd zero, 8(a2) + sd zero, 0(a2) + lui a4, 128 + addiw a4, a4, 9 + ld a5, 40(a2) + slli a4, a4, 13 + addi a4, a4, 2 + sd a4, 32(a2) + sw zero, 0(a5) + li a4, 100 + sw a4, 4(a5) + li a4, 1 + sw a4, 8(a5) + sw zero, 12(a5) + sd zero, 24(a2) + sext.w a4, s5 + bnez s10, .LBB0_3 +.LBB0_6: # %_halide_buffer_is_bounds_query.exit36 + ld a5, 0(a0) + bnez a5, .LBB0_8 +# %bb.7: # %then_bb5 + sd zero, 16(a0) + sd zero, 8(a0) + sd zero, 0(a0) + lui a5, 256 + addiw a5, a5, 9 + slli a5, a5, 13 + addi a5, a5, 2 + sd a5, 32(a0) + sw s3, 0(a3) + sw s7, 4(a3) + li a5, 1 + sw a5, 8(a3) + sw zero, 12(a3) + sw s4, 16(a3) + sw a4, 20(a3) + sw s7, 24(a3) + sw zero, 28(a3) + sd zero, 24(a0) +.LBB0_8: # %land.rhs.i49 + ld a0, 0(a0) + seqz a0, a0 +.LBB0_9: # %_halide_buffer_is_bounds_query.exit50 + ld a3, 16(a1) + ld a1, 0(a1) + ld a5, 16(a2) + ld a2, 0(a2) + or a1, a1, a3 + seqz a1, a1 + or a2, a2, a5 + seqz a2, a2 + or a1, a1, a2 + or a0, a0, a1 + slti a1, a4, 1 + or a0, a0, a1 + bnez a0, .LBB0_20 +# %bb.10: # %"for idw.s0.y.rebased.preheader" + blez s7, .LBB0_20 +# %bb.11: # %"for idw.s0.y.rebased.us.preheader" + li s0, 0 + slli s2, s7, 2 + mv s1, s5 +.LBB0_12: # %"for idw.s0.y.rebased.us" + # =>This Inner Loop Header: Depth=1 + slli a0, s0, 2 + add a0, a0, s10 + li a1, 0 + mv a2, s2 + call memset@plt + addi s1, s1, -1 + addw s0, s0, s6 + bnez s1, .LBB0_12 +# %bb.13: # %"for idw.s1.y.rebased.preheader" + blez s7, .LBB0_20 +# %bb.14: # %"for idw.s1.y.rebased.us.preheader" + li a6, 0 + addi s8, s8, 4 +.LBB0_15: # %"for idw.s1.y.rebased.us" + # =>This Loop Header: Depth=1 + # Child Loop BB0_16 Depth 2 + # Child Loop BB0_17 Depth 3 + li a1, 0 + mulw a7, s6, a6 + add a3, s4, a6 +.LBB0_16: # %"for idw.s1.x.rebased.us" + # Parent Loop BB0_15 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB0_17 Depth 3 + add a0, a1, a7 + slli a0, a0, 2 + add t0, s10, a0 + flw ft0, 0(t0) + add a5, s3, a1 + li a0, 100 + mv s1, s8 + mv s0, s9 +.LBB0_17: # %"for idw.s1.r4$x.us" + # Parent Loop BB0_15 Depth=1 + # Parent Loop BB0_16 Depth=2 + # => This Inner Loop Header: Depth=3 + lw a2, 0(s1) + lw a4, -4(s1) + subw a2, a5, a2 + subw a4, a3, a4 + mulw a2, a2, a2 + mulw a4, a4, a4 + flw ft1, 0(s0) + add a2, a2, a4 + fcvt.s.w ft2, a2 + fsqrt.s ft2, ft2 + fmadd.s ft0, ft2, ft1, ft0 + addi a0, a0, -1 + addi s0, s0, 4 + addi s1, s1, 12 + bnez a0, .LBB0_17 +# %bb.18: # %"end for idw.s1.r4$x.us" + # in Loop: Header=BB0_16 Depth=2 + addi a1, a1, 1 + fsw ft0, 0(t0) + bne a1, s7, .LBB0_16 +# %bb.19: # %"end for idw.s1.x.rebased.loopexit.us" + # in Loop: Header=BB0_15 Depth=1 + addi a6, a6, 1 + bne a6, s5, .LBB0_15 +.LBB0_20: # %destructor_block + li a0, 0 + ld ra, 88(sp) # 8-byte Folded Reload + ld s0, 80(sp) # 8-byte Folded Reload + ld s1, 72(sp) # 8-byte Folded Reload + ld s2, 64(sp) # 8-byte Folded Reload + ld s3, 56(sp) # 8-byte Folded Reload + ld s4, 48(sp) # 8-byte Folded Reload + ld s5, 40(sp) # 8-byte Folded Reload + ld s6, 32(sp) # 8-byte Folded Reload + ld s7, 24(sp) # 8-byte Folded Reload + ld s8, 16(sp) # 8-byte Folded Reload + ld s9, 8(sp) # 8-byte Folded Reload + ld s10, 0(sp) # 8-byte Folded Reload + addi sp, sp, 96 + ret +.Lfunc_end0: + .size idw, .Lfunc_end0-idw + # -- End function + .section .text.idw_argv,"ax",@progbits + .globl idw_argv # -- Begin function idw_argv + .p2align 1 + .type idw_argv,@function +idw_argv: # @idw_argv +# %bb.0: # %entry + addi sp, sp, -16 + sd ra, 8(sp) # 8-byte Folded Spill + ld a0, 0(a0) + call idw@plt + li a0, 0 + ld ra, 8(sp) # 8-byte Folded Reload + addi sp, sp, 16 + ret +.Lfunc_end1: + .size idw_argv, .Lfunc_end1-idw_argv + # -- End function + .section .text.idw_metadata,"ax",@progbits + .globl idw_metadata # -- Begin function idw_metadata + .p2align 1 + .type idw_metadata,@function +idw_metadata: # @idw_metadata +# %bb.0: # %entry +.Lpcrel_hi2: + auipc a0, %pcrel_hi(.Lidw_metadata_storage) + addi a0, a0, %pcrel_lo(.Lpcrel_hi2) + ret +.Lfunc_end2: + .size idw_metadata, .Lfunc_end2-idw_metadata + # -- End function + .type .Lb2.shape,@object # @b2.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb2.shape: + .asciz "\000\000\000\000,\001\000\000\001\000\000\000\000\000\000" + .size .Lb2.shape, 16 + + .type .Lb2.data,@object # @b2.data + .p2align 5, 0x0 +.Lb2.data: + .asciz "\000\000\000\000\000\000\000\000H\000\000\000\000\000\000\000\325\000\000\000O\000\000\000\000\000\000\000\252\001\000\000<\000\000\000\000\000\000\000\200\002\000\000L\000\000\000\000\000\000\000U\003\000\000\200\000\000\000\000\000\000\000*\004\000\000C\000\000\000\000\000\000\000\000\005\000\000A\000\000\000\000\000\000\000\325\005\000\000@\000\000\000\000\000\000\000\252\006\000\000<\000\000\000\000\000\000\000\200\007\000\000=\000\000\000x\000\000\000\000\000\000\000Q\000\000\000x\000\000\000\325\000\000\000O\000\000\000x\000\000\000\252\001\000\000:\000\000\000x\000\000\000\200\002\000\000\204\000\000\000x\000\000\000U\003\000\000\225\000\000\000x\000\000\000*\004\000\000\216\000\000\000x\000\000\000\000\005\000\000@\000\000\000x\000\000\000\325\005\000\000E\000\000\000x\000\000\000\252\006\000\000A\000\000\000x\000\000\000\200\007\000\000@\000\000\000\360\000\000\000\000\000\000\000K\000\000\000\360\000\000\000\325\000\000\000D\000\000\000\360\000\000\000\252\001\000\000\214\000\000\000\360\000\000\000\200\002\000\000\231\000\000\000\360\000\000\000U\003\000\000\221\000\000\000\360\000\000\000*\004\000\000\204\000\000\000\360\000\000\000\000\005\000\000\230\000\000\000\360\000\000\000\325\005\000\000}\000\000\000\360\000\000\000\252\006\000\000B\000\000\000\360\000\000\000\200\007\000\000:\000\000\000h\001\000\000\000\000\000\000N\000\000\000h\001\000\000\325\000\000\000<\000\000\000h\001\000\000\252\001\000\000\213\000\000\000h\001\000\000\200\002\000\000\250\000\000\000h\001\000\000U\003\000\000\232\000\000\000h\001\000\000*\004\000\000\212\000\000\000h\001\000\000\000\005\000\000\221\000\000\000h\001\000\000\325\005\000\000\240\000\000\000h\001\000\000\252\006\000\000D\000\000\000h\001\000\000\200\007\000\000<\000\000\000\340\001\000\000\000\000\000\000M\000\000\000\340\001\000\000\325\000\000\000;\000\000\000\340\001\000\000\252\001\000\000\245\000\000\000\340\001\000\000\200\002\000\000\267\000\000\000\340\001\000\000U\003\000\000\246\000\000\000\340\001\000\000*\004\000\000\216\000\000\000\340\001\000\000\000\005\000\000{\000\000\000\340\001\000\000\325\005\000\000\233\000\000\000\340\001\000\000\252\006\000\000\220\000\000\000\340\001\000\000\200\007\000\000<\000\000\000X\002\000\000\000\000\000\000S\000\000\000X\002\000\000\325\000\000\000A\000\000\000X\002\000\000\252\001\000\000\262\000\000\000X\002\000\000\200\002\000\000\270\000\000\000X\002\000\000U\003\000\000\212\000\000\000X\002\000\000*\004\000\000|\000\000\000X\002\000\000\000\005\000\000\204\000\000\000X\002\000\000\325\005\000\000\257\000\000\000X\002\000\000\252\006\000\000\257\000\000\000X\002\000\000\200\007\000\000<\000\000\000\320\002\000\000\000\000\000\000V\000\000\000\320\002\000\000\325\000\000\000<\000\000\000\320\002\000\000\252\001\000\000\266\000\000\000\320\002\000\000\200\002\000\000\263\000\000\000\320\002\000\000U\003\000\000\230\000\000\000\320\002\000\000*\004\000\000\214\000\000\000\320\002\000\000\000\005\000\000s\000\000\000\320\002\000\000\325\005\000\000\234\000\000\000\320\002\000\000\252\006\000\000\254\000\000\000\320\002\000\000\200\007\000\000A\000\000\000H\003\000\000\000\000\000\000Z\000\000\000H\003\000\000\325\000\000\000E\000\000\000H\003\000\000\252\001\000\000\244\000\000\000H\003\000\000\200\002\000\000\272\000\000\000H\003\000\000U\003\000\000\222\000\000\000H\003\000\000*\004\000\000\223\000\000\000H\003\000\000\000\005\000\000z\000\000\000H\003\000\000\325\005\000\000\230\000\000\000H\003\000\000\252\006\000\000\246\000\000\000H\003\000\000\200\007\000\000E\000\000\000\300\003\000\000\000\000\000\000U\000\000\000\300\003\000\000\325\000\000\000C\000\000\000\300\003\000\000\252\001\000\000\272\000\000\000\300\003\000\000\200\002\000\000\251\000\000\000\300\003\000\000U\003\000\000\242\000\000\000\300\003\000\000*\004\000\000\223\000\000\000\300\003\000\000\000\005\000\000|\000\000\000\300\003\000\000\325\005\000\000\225\000\000\000\300\003\000\000\252\006\000\000\235\000\000\000\300\003\000\000\200\007\000\000E\000\000\0008\004\000\000\000\000\000\000U\000\000\0008\004\000\000\325\000\000\000F\000\000\0008\004\000\000\252\001\000\000\301\000\000\0008\004\000\000\200\002\000\000\300\000\000\0008\004\000\000U\003\000\000\233\000\000\0008\004\000\000*\004\000\000\207\000\000\0008\004\000\000\000\005\000\000x\000\000\0008\004\000\000\325\005\000\000\212\000\000\0008\004\000\000\252\006\000\000\200\000\000\0008\004\000\000\200\007\000\000F\000\000" + .size .Lb2.data, 1200 + + .type .Lb2.buffer,@object # @b2.buffer + .data + .p2align 4, 0x0 +.Lb2.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb2.data + .quad 1 # 0x1 + .byte 0 # 0x0 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb2.shape + .quad 0 + .size .Lb2.buffer, 56 + + .type .Lb3.shape,@object # @b3.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb3.shape: + .asciz "\000\000\000\000d\000\000\000\001\000\000\000\000\000\000" + .size .Lb3.shape, 16 + + .type .Lb3.data,@object # @b3.data + .p2align 5, 0x0 +.Lb3.data: + .ascii "\211\004&=\244\022\233\275\217U%;\272)J>#\026\024\276@_\234>\340\243|\275\334\034K\275\232\300\004:A\313\245\274\303\024r\275\345\313a\275x\231\300>\201\334\"\276\236}\321\275H\346\257\276\202V\310>\225\375(>},\026\274E\235W\275\241\236\236<\245\220\222=\004\361\263\276\2620\240\274\265\036\240=\\:\336=\240\016\263\276\260\224\371\275p\235(=\311\\x;\226y\\\275\360\227\377=\275\364\200=rw/<@U\017=c\327\003=nV\267\273\3543\207\311\274<&{\274h\b\031>\203\222\003\276\213\3649\275d\177\017\276\330\306\246\275'\314:>\203\024\253=S\331M\276\257\263S=b\221q\275\336\t\306=#^\005\2763)\256\274\344\255G>\224!\374=\374\360o\275\215#\031\276\322`J\276y6\360=\273\303;\275N\"N>M\022K\276\016\035C=1Fb\275u\017\220\275\0040\033>,\037\223=t\335\304\275\363\344\271=\320\341\260\275\2265\255=\310\"b=&\031\330\275;\314\000>>\347o\275\255\222\020=C\rU<\370d\307\275\373\306\206=I\345\373\274Y|\034>WY;\276O\240?>B\347\226\275\203\271\203\275W\243\313<\214\364\343\274Ey\033\276J4\214=\261\304\241\274-\271p>X\313l\276*_\366\275\260^\\=\237l1=\371\272\200=\002\373?\274*\3578<]Gv=" + .size .Lb3.data, 400 + + .type .Lb3.buffer,@object # @b3.buffer + .data + .p2align 4, 0x0 +.Lb3.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb3.data + .quad 1 # 0x1 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb3.shape + .quad 0 + .size .Lb3.buffer, 56 + + .type .L__unnamed_1,@object # @0 + .section .rodata,"a",@progbits + .p2align 4, 0x0 +.L__unnamed_1: + .zero 32 + .size .L__unnamed_1, 32 + + .type .Lstr,@object # @str + .p2align 5, 0x0 +.Lstr: + .asciz "idw" + .size .Lstr, 4 + + .type .L__unnamed_2,@object # @1 + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.L__unnamed_2: + .quad .Lstr + .word 2 # 0x2 + .word 2 # 0x2 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .zero 4 + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .quad .L__unnamed_1 + .size .L__unnamed_2, 64 + + .type .Lstr.4,@object # @str.4 + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lstr.4: + .asciz "riscv-64-linux-no_asserts-no_runtime-rvv-vector_bits_128" + .size .Lstr.4, 57 + + .type .Lidw_metadata_storage,@object # @idw_metadata_storage + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.Lidw_metadata_storage: + .word 1 # 0x1 + .word 1 # 0x1 + .quad .L__unnamed_2 + .quad .Lstr.4 + .quad .Lstr + .size .Lidw_metadata_storage, 32 + + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .section ".note.GNU-stack","",@progbits diff --git a/aot/idw_halide.h b/aot/idw_halide.h new file mode 100644 index 0000000..e06605e --- /dev/null +++ b/aot/idw_halide.h @@ -0,0 +1,56 @@ +#ifndef HALIDE__idw_halide_h +#define HALIDE__idw_halide_h +#include + +// Forward declarations of the types used in the interface +// to the Halide pipeline. +// +// For the definitions of these structs, include HalideRuntime.h + +// Halide's representation of a multi-dimensional array. +// Halide::Runtime::Buffer is a more user-friendly wrapper +// around this. Its declaration is in HalideBuffer.h +struct halide_buffer_t; + +// Metadata describing the arguments to the generated function. +// Used to construct calls to the _argv version of the function. +struct halide_filter_metadata_t; + +#ifndef HALIDE_MUST_USE_RESULT +#ifdef __has_attribute +#if __has_attribute(nodiscard) +#define HALIDE_MUST_USE_RESULT [[nodiscard]] +#elif __has_attribute(warn_unused_result) +#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result)) +#else +#define HALIDE_MUST_USE_RESULT +#endif +#else +#define HALIDE_MUST_USE_RESULT +#endif +#endif + +#ifndef HALIDE_FUNCTION_ATTRS +#define HALIDE_FUNCTION_ATTRS +#endif + + + +#ifdef __cplusplus +extern "C" { +#endif + +HALIDE_FUNCTION_ATTRS +int idw_halide_(struct halide_buffer_t *_idw_halide__buffer); + +HALIDE_FUNCTION_ATTRS +int idw_halide__argv(void **args); + +HALIDE_FUNCTION_ATTRS +const struct halide_filter_metadata_t *idw_halide__metadata(); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/aot/idw_halide.s b/aot/idw_halide.s new file mode 100644 index 0000000..74b4a7e --- /dev/null +++ b/aot/idw_halide.s @@ -0,0 +1,362 @@ + .text + .attribute 4, 16 + .attribute 5, "rv64gcv0p7" + .file "halide_buffer_t.cpp" + .section .text.idw_halide_,"ax",@progbits + .globl idw_halide_ # -- Begin function idw_halide_ + .p2align 1 + .type idw_halide_,@function +idw_halide_: # @idw_halide_ +# %bb.0: # %entry + addi sp, sp, -96 + sd ra, 88(sp) # 8-byte Folded Spill + sd s0, 80(sp) # 8-byte Folded Spill + sd s1, 72(sp) # 8-byte Folded Spill + sd s2, 64(sp) # 8-byte Folded Spill + sd s3, 56(sp) # 8-byte Folded Spill + sd s4, 48(sp) # 8-byte Folded Spill + sd s5, 40(sp) # 8-byte Folded Spill + sd s6, 32(sp) # 8-byte Folded Spill + sd s7, 24(sp) # 8-byte Folded Spill + sd s8, 16(sp) # 8-byte Folded Spill + sd s9, 8(sp) # 8-byte Folded Spill + sd s10, 0(sp) # 8-byte Folded Spill +.Lpcrel_hi0: + auipc a1, %pcrel_hi(.Lb2.buffer) + addi a1, a1, %pcrel_lo(.Lpcrel_hi0) + ld s8, 16(a1) + ld a3, 40(a0) +.Lpcrel_hi1: + auipc a2, %pcrel_hi(.Lb3.buffer) + addi a2, a2, %pcrel_lo(.Lpcrel_hi1) + ld s9, 16(a2) + lw s3, 0(a3) + lw s7, 4(a3) + lw s4, 16(a3) + ld a4, 0(a1) + lwu s5, 20(a3) + lw s6, 24(a3) + or a5, s8, a4 + mv a4, s9 + beqz a5, .LBB0_4 +# %bb.1: # %after_bb + ld a5, 0(a2) + ld s10, 16(a0) + or a4, a4, a5 + beqz a4, .LBB0_5 +.LBB0_2: # %after_bb1 + sext.w a4, s5 + beqz s10, .LBB0_6 +.LBB0_3: + li a0, 0 + j .LBB0_9 +.LBB0_4: # %then_bb + lui a4, 128 + addiw a4, a4, 9 + slli a4, a4, 13 + sd a4, 32(a1) + ld a4, 40(a1) + sd zero, 0(a1) + sd zero, 8(a1) + sd zero, 16(a1) + sw zero, 0(a4) + li a5, 300 + sw a5, 4(a4) + li a5, 1 + sw a5, 8(a4) + sw zero, 12(a4) + ld a4, 16(a2) + sd zero, 24(a1) + ld a5, 0(a2) + ld s10, 16(a0) + or a4, a4, a5 + bnez a4, .LBB0_2 +.LBB0_5: # %then_bb2 + sd zero, 16(a2) + sd zero, 8(a2) + sd zero, 0(a2) + lui a4, 128 + addiw a4, a4, 9 + ld a5, 40(a2) + slli a4, a4, 13 + addi a4, a4, 2 + sd a4, 32(a2) + sw zero, 0(a5) + li a4, 100 + sw a4, 4(a5) + li a4, 1 + sw a4, 8(a5) + sw zero, 12(a5) + sd zero, 24(a2) + sext.w a4, s5 + bnez s10, .LBB0_3 +.LBB0_6: # %_halide_buffer_is_bounds_query.exit36 + ld a5, 0(a0) + bnez a5, .LBB0_8 +# %bb.7: # %then_bb5 + sd zero, 16(a0) + sd zero, 8(a0) + sd zero, 0(a0) + lui a5, 256 + addiw a5, a5, 9 + slli a5, a5, 13 + addi a5, a5, 2 + sd a5, 32(a0) + sw s3, 0(a3) + sw s7, 4(a3) + li a5, 1 + sw a5, 8(a3) + sw zero, 12(a3) + sw s4, 16(a3) + sw a4, 20(a3) + sw s7, 24(a3) + sw zero, 28(a3) + sd zero, 24(a0) +.LBB0_8: # %land.rhs.i49 + ld a0, 0(a0) + seqz a0, a0 +.LBB0_9: # %_halide_buffer_is_bounds_query.exit50 + ld a3, 16(a1) + ld a1, 0(a1) + ld a5, 16(a2) + ld a2, 0(a2) + or a1, a1, a3 + seqz a1, a1 + or a2, a2, a5 + seqz a2, a2 + or a1, a1, a2 + or a0, a0, a1 + slti a1, a4, 1 + or a0, a0, a1 + bnez a0, .LBB0_20 +# %bb.10: # %"for idw_halide_.s0.y.rebased.preheader" + blez s7, .LBB0_20 +# %bb.11: # %"for idw_halide_.s0.y.rebased.us.preheader" + li s0, 0 + slli s2, s7, 2 + mv s1, s5 +.LBB0_12: # %"for idw_halide_.s0.y.rebased.us" + # =>This Inner Loop Header: Depth=1 + slli a0, s0, 2 + add a0, a0, s10 + li a1, 0 + mv a2, s2 + call memset@plt + addi s1, s1, -1 + addw s0, s0, s6 + bnez s1, .LBB0_12 +# %bb.13: # %"for idw_halide_.s1.y.rebased.preheader" + blez s7, .LBB0_20 +# %bb.14: # %"for idw_halide_.s1.y.rebased.us.preheader" + li a6, 0 + addi s8, s8, 4 +.LBB0_15: # %"for idw_halide_.s1.y.rebased.us" + # =>This Loop Header: Depth=1 + # Child Loop BB0_16 Depth 2 + # Child Loop BB0_17 Depth 3 + li a1, 0 + mulw a7, s6, a6 + add a3, s4, a6 +.LBB0_16: # %"for idw_halide_.s1.x.rebased.us" + # Parent Loop BB0_15 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB0_17 Depth 3 + add a0, a1, a7 + slli a0, a0, 2 + add t0, s10, a0 + flw ft0, 0(t0) + add a5, s3, a1 + li a0, 100 + mv s1, s8 + mv s0, s9 +.LBB0_17: # %"for idw_halide_.s1.r4$x.us" + # Parent Loop BB0_15 Depth=1 + # Parent Loop BB0_16 Depth=2 + # => This Inner Loop Header: Depth=3 + lw a2, 0(s1) + lw a4, -4(s1) + subw a2, a5, a2 + subw a4, a3, a4 + mulw a2, a2, a2 + mulw a4, a4, a4 + flw ft1, 0(s0) + add a2, a2, a4 + fcvt.s.w ft2, a2 + fsqrt.s ft2, ft2 + fmadd.s ft0, ft2, ft1, ft0 + addi a0, a0, -1 + addi s0, s0, 4 + addi s1, s1, 12 + bnez a0, .LBB0_17 +# %bb.18: # %"end for idw_halide_.s1.r4$x.us" + # in Loop: Header=BB0_16 Depth=2 + addi a1, a1, 1 + fsw ft0, 0(t0) + bne a1, s7, .LBB0_16 +# %bb.19: # %"end for idw_halide_.s1.x.rebased.loopexit.us" + # in Loop: Header=BB0_15 Depth=1 + addi a6, a6, 1 + bne a6, s5, .LBB0_15 +.LBB0_20: # %destructor_block + li a0, 0 + ld ra, 88(sp) # 8-byte Folded Reload + ld s0, 80(sp) # 8-byte Folded Reload + ld s1, 72(sp) # 8-byte Folded Reload + ld s2, 64(sp) # 8-byte Folded Reload + ld s3, 56(sp) # 8-byte Folded Reload + ld s4, 48(sp) # 8-byte Folded Reload + ld s5, 40(sp) # 8-byte Folded Reload + ld s6, 32(sp) # 8-byte Folded Reload + ld s7, 24(sp) # 8-byte Folded Reload + ld s8, 16(sp) # 8-byte Folded Reload + ld s9, 8(sp) # 8-byte Folded Reload + ld s10, 0(sp) # 8-byte Folded Reload + addi sp, sp, 96 + ret +.Lfunc_end0: + .size idw_halide_, .Lfunc_end0-idw_halide_ + # -- End function + .section .text.idw_halide__argv,"ax",@progbits + .globl idw_halide__argv # -- Begin function idw_halide__argv + .p2align 1 + .type idw_halide__argv,@function +idw_halide__argv: # @idw_halide__argv +# %bb.0: # %entry + addi sp, sp, -16 + sd ra, 8(sp) # 8-byte Folded Spill + ld a0, 0(a0) + call idw_halide_@plt + li a0, 0 + ld ra, 8(sp) # 8-byte Folded Reload + addi sp, sp, 16 + ret +.Lfunc_end1: + .size idw_halide__argv, .Lfunc_end1-idw_halide__argv + # -- End function + .section .text.idw_halide__metadata,"ax",@progbits + .globl idw_halide__metadata # -- Begin function idw_halide__metadata + .p2align 1 + .type idw_halide__metadata,@function +idw_halide__metadata: # @idw_halide__metadata +# %bb.0: # %entry +.Lpcrel_hi2: + auipc a0, %pcrel_hi(.Lidw_halide__metadata_storage) + addi a0, a0, %pcrel_lo(.Lpcrel_hi2) + ret +.Lfunc_end2: + .size idw_halide__metadata, .Lfunc_end2-idw_halide__metadata + # -- End function + .type .Lb2.shape,@object # @b2.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb2.shape: + .asciz "\000\000\000\000,\001\000\000\001\000\000\000\000\000\000" + .size .Lb2.shape, 16 + + .type .Lb2.data,@object # @b2.data + .p2align 5, 0x0 +.Lb2.data: + .asciz "\000\000\000\000\000\000\000\000H\000\000\000\000\000\000\000\325\000\000\000O\000\000\000\000\000\000\000\252\001\000\000<\000\000\000\000\000\000\000\200\002\000\000L\000\000\000\000\000\000\000U\003\000\000\200\000\000\000\000\000\000\000*\004\000\000C\000\000\000\000\000\000\000\000\005\000\000A\000\000\000\000\000\000\000\325\005\000\000@\000\000\000\000\000\000\000\252\006\000\000<\000\000\000\000\000\000\000\200\007\000\000=\000\000\000x\000\000\000\000\000\000\000Q\000\000\000x\000\000\000\325\000\000\000O\000\000\000x\000\000\000\252\001\000\000:\000\000\000x\000\000\000\200\002\000\000\204\000\000\000x\000\000\000U\003\000\000\225\000\000\000x\000\000\000*\004\000\000\216\000\000\000x\000\000\000\000\005\000\000@\000\000\000x\000\000\000\325\005\000\000E\000\000\000x\000\000\000\252\006\000\000A\000\000\000x\000\000\000\200\007\000\000@\000\000\000\360\000\000\000\000\000\000\000K\000\000\000\360\000\000\000\325\000\000\000D\000\000\000\360\000\000\000\252\001\000\000\214\000\000\000\360\000\000\000\200\002\000\000\231\000\000\000\360\000\000\000U\003\000\000\221\000\000\000\360\000\000\000*\004\000\000\204\000\000\000\360\000\000\000\000\005\000\000\230\000\000\000\360\000\000\000\325\005\000\000}\000\000\000\360\000\000\000\252\006\000\000B\000\000\000\360\000\000\000\200\007\000\000:\000\000\000h\001\000\000\000\000\000\000N\000\000\000h\001\000\000\325\000\000\000<\000\000\000h\001\000\000\252\001\000\000\213\000\000\000h\001\000\000\200\002\000\000\250\000\000\000h\001\000\000U\003\000\000\232\000\000\000h\001\000\000*\004\000\000\212\000\000\000h\001\000\000\000\005\000\000\221\000\000\000h\001\000\000\325\005\000\000\240\000\000\000h\001\000\000\252\006\000\000D\000\000\000h\001\000\000\200\007\000\000<\000\000\000\340\001\000\000\000\000\000\000M\000\000\000\340\001\000\000\325\000\000\000;\000\000\000\340\001\000\000\252\001\000\000\245\000\000\000\340\001\000\000\200\002\000\000\267\000\000\000\340\001\000\000U\003\000\000\246\000\000\000\340\001\000\000*\004\000\000\216\000\000\000\340\001\000\000\000\005\000\000{\000\000\000\340\001\000\000\325\005\000\000\233\000\000\000\340\001\000\000\252\006\000\000\220\000\000\000\340\001\000\000\200\007\000\000<\000\000\000X\002\000\000\000\000\000\000S\000\000\000X\002\000\000\325\000\000\000A\000\000\000X\002\000\000\252\001\000\000\262\000\000\000X\002\000\000\200\002\000\000\270\000\000\000X\002\000\000U\003\000\000\212\000\000\000X\002\000\000*\004\000\000|\000\000\000X\002\000\000\000\005\000\000\204\000\000\000X\002\000\000\325\005\000\000\257\000\000\000X\002\000\000\252\006\000\000\257\000\000\000X\002\000\000\200\007\000\000<\000\000\000\320\002\000\000\000\000\000\000V\000\000\000\320\002\000\000\325\000\000\000<\000\000\000\320\002\000\000\252\001\000\000\266\000\000\000\320\002\000\000\200\002\000\000\263\000\000\000\320\002\000\000U\003\000\000\230\000\000\000\320\002\000\000*\004\000\000\214\000\000\000\320\002\000\000\000\005\000\000s\000\000\000\320\002\000\000\325\005\000\000\234\000\000\000\320\002\000\000\252\006\000\000\254\000\000\000\320\002\000\000\200\007\000\000A\000\000\000H\003\000\000\000\000\000\000Z\000\000\000H\003\000\000\325\000\000\000E\000\000\000H\003\000\000\252\001\000\000\244\000\000\000H\003\000\000\200\002\000\000\272\000\000\000H\003\000\000U\003\000\000\222\000\000\000H\003\000\000*\004\000\000\223\000\000\000H\003\000\000\000\005\000\000z\000\000\000H\003\000\000\325\005\000\000\230\000\000\000H\003\000\000\252\006\000\000\246\000\000\000H\003\000\000\200\007\000\000E\000\000\000\300\003\000\000\000\000\000\000U\000\000\000\300\003\000\000\325\000\000\000C\000\000\000\300\003\000\000\252\001\000\000\272\000\000\000\300\003\000\000\200\002\000\000\251\000\000\000\300\003\000\000U\003\000\000\242\000\000\000\300\003\000\000*\004\000\000\223\000\000\000\300\003\000\000\000\005\000\000|\000\000\000\300\003\000\000\325\005\000\000\225\000\000\000\300\003\000\000\252\006\000\000\235\000\000\000\300\003\000\000\200\007\000\000E\000\000\0008\004\000\000\000\000\000\000U\000\000\0008\004\000\000\325\000\000\000F\000\000\0008\004\000\000\252\001\000\000\301\000\000\0008\004\000\000\200\002\000\000\300\000\000\0008\004\000\000U\003\000\000\233\000\000\0008\004\000\000*\004\000\000\207\000\000\0008\004\000\000\000\005\000\000x\000\000\0008\004\000\000\325\005\000\000\212\000\000\0008\004\000\000\252\006\000\000\200\000\000\0008\004\000\000\200\007\000\000F\000\000" + .size .Lb2.data, 1200 + + .type .Lb2.buffer,@object # @b2.buffer + .data + .p2align 4, 0x0 +.Lb2.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb2.data + .quad 1 # 0x1 + .byte 0 # 0x0 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb2.shape + .quad 0 + .size .Lb2.buffer, 56 + + .type .Lb3.shape,@object # @b3.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb3.shape: + .asciz "\000\000\000\000d\000\000\000\001\000\000\000\000\000\000" + .size .Lb3.shape, 16 + + .type .Lb3.data,@object # @b3.data + .p2align 5, 0x0 +.Lb3.data: + .ascii "\211\004&=\244\022\233\275\217U%;\272)J>#\026\024\276@_\234>\340\243|\275\334\034K\275\232\300\004:A\313\245\274\303\024r\275\345\313a\275x\231\300>\201\334\"\276\236}\321\275H\346\257\276\202V\310>\225\375(>},\026\274E\235W\275\241\236\236<\245\220\222=\004\361\263\276\2620\240\274\265\036\240=\\:\336=\240\016\263\276\260\224\371\275p\235(=\311\\x;\226y\\\275\360\227\377=\275\364\200=rw/<@U\017=c\327\003=nV\267\273\3543\207\311\274<&{\274h\b\031>\203\222\003\276\213\3649\275d\177\017\276\330\306\246\275'\314:>\203\024\253=S\331M\276\257\263S=b\221q\275\336\t\306=#^\005\2763)\256\274\344\255G>\224!\374=\374\360o\275\215#\031\276\322`J\276y6\360=\273\303;\275N\"N>M\022K\276\016\035C=1Fb\275u\017\220\275\0040\033>,\037\223=t\335\304\275\363\344\271=\320\341\260\275\2265\255=\310\"b=&\031\330\275;\314\000>>\347o\275\255\222\020=C\rU<\370d\307\275\373\306\206=I\345\373\274Y|\034>WY;\276O\240?>B\347\226\275\203\271\203\275W\243\313<\214\364\343\274Ey\033\276J4\214=\261\304\241\274-\271p>X\313l\276*_\366\275\260^\\=\237l1=\371\272\200=\002\373?\274*\3578<]Gv=" + .size .Lb3.data, 400 + + .type .Lb3.buffer,@object # @b3.buffer + .data + .p2align 4, 0x0 +.Lb3.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb3.data + .quad 1 # 0x1 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb3.shape + .quad 0 + .size .Lb3.buffer, 56 + + .type .L__unnamed_1,@object # @0 + .section .rodata,"a",@progbits + .p2align 4, 0x0 +.L__unnamed_1: + .zero 32 + .size .L__unnamed_1, 32 + + .type .Lstr,@object # @str + .p2align 5, 0x0 +.Lstr: + .asciz "idw_halide_" + .size .Lstr, 12 + + .type .L__unnamed_2,@object # @1 + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.L__unnamed_2: + .quad .Lstr + .word 2 # 0x2 + .word 2 # 0x2 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .zero 4 + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .quad .L__unnamed_1 + .size .L__unnamed_2, 64 + + .type .Lstr.4,@object # @str.4 + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lstr.4: + .asciz "riscv-64-linux-no_asserts-no_runtime-rvv-vector_bits_128" + .size .Lstr.4, 57 + + .type .Lidw_halide__metadata_storage,@object # @idw_halide__metadata_storage + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.Lidw_halide__metadata_storage: + .word 1 # 0x1 + .word 1 # 0x1 + .quad .L__unnamed_2 + .quad .Lstr.4 + .quad .Lstr + .size .Lidw_halide__metadata_storage, 32 + + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .section ".note.GNU-stack","",@progbits diff --git a/aot/idw_halide_parallel.h b/aot/idw_halide_parallel.h new file mode 100644 index 0000000..6f272cb --- /dev/null +++ b/aot/idw_halide_parallel.h @@ -0,0 +1,56 @@ +#ifndef HALIDE__idw_halide_parallel_h +#define HALIDE__idw_halide_parallel_h +#include + +// Forward declarations of the types used in the interface +// to the Halide pipeline. +// +// For the definitions of these structs, include HalideRuntime.h + +// Halide's representation of a multi-dimensional array. +// Halide::Runtime::Buffer is a more user-friendly wrapper +// around this. Its declaration is in HalideBuffer.h +struct halide_buffer_t; + +// Metadata describing the arguments to the generated function. +// Used to construct calls to the _argv version of the function. +struct halide_filter_metadata_t; + +#ifndef HALIDE_MUST_USE_RESULT +#ifdef __has_attribute +#if __has_attribute(nodiscard) +#define HALIDE_MUST_USE_RESULT [[nodiscard]] +#elif __has_attribute(warn_unused_result) +#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result)) +#else +#define HALIDE_MUST_USE_RESULT +#endif +#else +#define HALIDE_MUST_USE_RESULT +#endif +#endif + +#ifndef HALIDE_FUNCTION_ATTRS +#define HALIDE_FUNCTION_ATTRS +#endif + + + +#ifdef __cplusplus +extern "C" { +#endif + +HALIDE_FUNCTION_ATTRS +int idw_halide_parallel_(struct halide_buffer_t *_idw_halide_parallel__buffer); + +HALIDE_FUNCTION_ATTRS +int idw_halide_parallel__argv(void **args); + +HALIDE_FUNCTION_ATTRS +const struct halide_filter_metadata_t *idw_halide_parallel__metadata(); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/aot/idw_halide_parallel.s b/aot/idw_halide_parallel.s new file mode 100644 index 0000000..7578098 --- /dev/null +++ b/aot/idw_halide_parallel.s @@ -0,0 +1,598 @@ + .text + .attribute 4, 16 + .attribute 5, "rv64gcv0p7" + .file "halide_buffer_t.cpp" + .section .text.idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1,"ax",@progbits + .globl idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1 # -- Begin function idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1 + .p2align 1 + .type idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1,@function +idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1: # @idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1 +# %bb.0: # %entry + addi sp, sp, -48 + sd ra, 40(sp) # 8-byte Folded Spill + sd s0, 32(sp) # 8-byte Folded Spill + sd s1, 24(sp) # 8-byte Folded Spill + sd s2, 16(sp) # 8-byte Folded Spill + sd s3, 8(sp) # 8-byte Folded Spill + lw a4, 24(a2) + lw a7, 8(a2) + lw a6, 12(a2) + sraiw a3, a1, 31 + seqz a5, a4 + negw a0, a5 + srai s1, a4, 31 + subw t0, a1, a3 + or a0, a0, a4 + remw a1, t0, a0 + xor s0, s1, a4 + not a0, s1 + add s0, s0, a0 + and s0, s0, a3 + add a1, a1, s0 + li s0, 480 + mulw a1, a1, s0 + addi s0, a5, -1 + and t2, s0, a1 + addiw a1, a7, -480 + blt t2, a1, .LBB0_2 +# %bb.1: # %entry + mv t2, a1 +.LBB0_2: # %entry + lw a7, 16(a2) + lw s2, 20(a2) + lw t1, 28(a2) + add a4, a4, a5 + divw a1, t0, a4 + subw a0, a0, s1 + and a0, a0, a3 + add a0, a0, a1 + li s1, 270 + mulw a0, a0, s1 + and a0, a0, s0 + addiw a1, a6, -270 + blt a0, a1, .LBB0_4 +# %bb.3: # %entry + mv a0, a1 +.LBB0_4: # %entry + ld s3, 0(a2) + add a0, a0, a7 + add t1, t1, t2 + mulw a0, s2, a0 + addw s0, t1, a0 +.LBB0_5: # %"for idw_halide_parallel_.s0.y.y_inner" + # =>This Inner Loop Header: Depth=1 + slli a0, s0, 2 + add a0, a0, s3 + li a2, 1920 + li a1, 0 + call memset@plt + addi s1, s1, -1 + addw s0, s0, s2 + bnez s1, .LBB0_5 +# %bb.6: # %destructor_block + li a0, 0 + ld ra, 40(sp) # 8-byte Folded Reload + ld s0, 32(sp) # 8-byte Folded Reload + ld s1, 24(sp) # 8-byte Folded Reload + ld s2, 16(sp) # 8-byte Folded Reload + ld s3, 8(sp) # 8-byte Folded Reload + addi sp, sp, 48 + ret +.Lfunc_end0: + .size idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1, .Lfunc_end0-idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1 + # -- End function + .section .text.idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1,"ax",@progbits + .globl idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1 # -- Begin function idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1 + .p2align 1 + .type idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1,@function +idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1: # @idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1 +# %bb.0: # %entry + addi sp, sp, -48 + sd s0, 40(sp) # 8-byte Folded Spill + sd s1, 32(sp) # 8-byte Folded Spill + sd s2, 24(sp) # 8-byte Folded Spill + sd s3, 16(sp) # 8-byte Folded Spill + sd s4, 8(sp) # 8-byte Folded Spill + sd s5, 0(sp) # 8-byte Folded Spill + li a7, 0 + ld s3, 0(a2) + ld t4, 8(a2) + ld t3, 16(a2) + lw a0, 36(a2) + lw t5, 24(a2) + lw t1, 28(a2) + sraiw a3, a1, 31 + seqz a4, a0 + negw a5, a4 + srai s1, a0, 31 + subw a1, a1, a3 + or a5, a5, a0 + remw a6, a1, a5 + xor s0, s1, a0 + not a5, s1 + add s0, s0, a5 + and s0, s0, a3 + add s0, s0, a6 + lw a6, 32(a2) + lw t6, 40(a2) + add a0, a0, a4 + divw a0, a1, a0 + subw a5, a5, s1 + and a3, a3, a5 + add a0, a0, a3 + li t2, 480 + mulw a1, s0, t2 + addi a4, a4, -1 + and s2, a4, a1 + li t0, 270 + mulw a0, a0, t0 + and a0, a0, a4 + add t1, t1, a0 + addi s3, s3, 4 +.LBB1_1: # %"for idw_halide_parallel_.s1.y.y_inner" + # =>This Loop Header: Depth=1 + # Child Loop BB1_2 Depth 2 + # Child Loop BB1_3 Depth 3 + li a4, 0 + add a3, t1, a7 + mulw s4, a3, a6 +.LBB1_2: # %"for idw_halide_parallel_.s1.x.x_inner" + # Parent Loop BB1_1 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB1_3 Depth 3 + add s0, s2, a4 + add a0, s0, t6 + addw a0, a0, s4 + slli a0, a0, 2 + add s5, t3, a0 + flw ft0, 0(s5) + add s0, s0, t5 + li s1, 100 + mv a1, s3 + mv a0, t4 +.LBB1_3: # %"for idw_halide_parallel_.s1.r13$x" + # Parent Loop BB1_1 Depth=1 + # Parent Loop BB1_2 Depth=2 + # => This Inner Loop Header: Depth=3 + lw a5, 0(a1) + lw a2, -4(a1) + subw a5, s0, a5 + subw a2, a3, a2 + mulw a5, a5, a5 + mulw a2, a2, a2 + flw ft1, 0(a0) + add a2, a2, a5 + fcvt.s.w ft2, a2 + fsqrt.s ft2, ft2 + fmadd.s ft0, ft2, ft1, ft0 + addi s1, s1, -1 + addi a0, a0, 4 + addi a1, a1, 12 + bnez s1, .LBB1_3 +# %bb.4: # %"end for idw_halide_parallel_.s1.r13$x" + # in Loop: Header=BB1_2 Depth=2 + addi a4, a4, 1 + fsw ft0, 0(s5) + bne a4, t2, .LBB1_2 +# %bb.5: # %"end for idw_halide_parallel_.s1.x.x_inner" + # in Loop: Header=BB1_1 Depth=1 + addiw a7, a7, 1 + bne a7, t0, .LBB1_1 +# %bb.6: # %destructor_block + li a0, 0 + ld s0, 40(sp) # 8-byte Folded Reload + ld s1, 32(sp) # 8-byte Folded Reload + ld s2, 24(sp) # 8-byte Folded Reload + ld s3, 16(sp) # 8-byte Folded Reload + ld s4, 8(sp) # 8-byte Folded Reload + ld s5, 0(sp) # 8-byte Folded Reload + addi sp, sp, 48 + ret +.Lfunc_end1: + .size idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1, .Lfunc_end1-idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1 + # -- End function + .section .text.idw_halide_parallel_,"ax",@progbits + .globl idw_halide_parallel_ # -- Begin function idw_halide_parallel_ + .p2align 1 + .type idw_halide_parallel_,@function +idw_halide_parallel_: # @idw_halide_parallel_ +# %bb.0: # %entry + addi sp, sp, -176 + sd ra, 168(sp) # 8-byte Folded Spill + sd s0, 160(sp) # 8-byte Folded Spill + sd s1, 152(sp) # 8-byte Folded Spill + sd s2, 144(sp) # 8-byte Folded Spill + sd s3, 136(sp) # 8-byte Folded Spill + sd s4, 128(sp) # 8-byte Folded Spill + sd s5, 120(sp) # 8-byte Folded Spill + sd s6, 112(sp) # 8-byte Folded Spill + sd s7, 104(sp) # 8-byte Folded Spill + sd s8, 96(sp) # 8-byte Folded Spill + sd s9, 88(sp) # 8-byte Folded Spill + ld t5, 40(a0) + lw a4, 4(t5) + lw t2, 20(t5) + li a2, -480 + subw a6, a2, a4 + addiw a2, a4, -1 + blt a2, a6, .LBB2_2 +# %bb.1: # %entry + mv a6, a2 +.LBB2_2: # %entry +.Lpcrel_hi0: + auipc a2, %pcrel_hi(.Lb24.buffer) + addi a3, a2, %pcrel_lo(.Lpcrel_hi0) + sgtz t4, a4 + slti a5, a4, -479 + slti s0, t2, -269 + lui s1, 559241 + li a7, 480 + addiw t1, s1, -1911 + mv t0, a4 + blt a4, a7, .LBB2_4 +# %bb.3: # %entry + li t0, 480 +.LBB2_4: # %entry +.Lpcrel_hi1: + auipc t6, %pcrel_hi(.Lb25.buffer) + ld s2, 16(a3) + add a1, a4, a5 + addiw a1, a1, 479 + mul a2, a1, t1 + srli a2, a2, 32 + add a1, a1, a2 + srliw a2, a1, 31 + sraiw a1, a1, 8 + add a1, a1, a2 + subw s9, a1, a5 + add a1, t2, s0 + addiw a1, a1, 269 + lui a2, 994205 + addiw a2, a2, 1609 + mul a2, a1, a2 + srli a2, a2, 32 + add a1, a1, a2 + srliw a2, a1, 31 + sraiw a1, a1, 8 + add a1, a1, a2 + subw a1, a1, s0 + mulw s7, a1, s9 + slti a1, s7, 1 + negw a2, a1 + seqz s1, s9 + add a1, a1, s7 + addiw a1, a1, -1 + add s0, s9, s1 + divw a1, a1, s0 + add a5, a5, a5 + addiw a5, a5, -1 + and a2, a2, a5 + add a1, a1, a2 + li a2, 270 + mulw a1, a1, a2 + addi s1, s1, -1 + and t3, s1, a1 + addi t4, t4, -1 + and t4, t4, t3 + addiw a2, t2, -270 + addi a5, t6, %pcrel_lo(.Lpcrel_hi1) + blt t4, a2, .LBB2_6 +# %bb.5: # %entry + mv t4, a2 +.LBB2_6: # %entry + ld s4, 16(a5) + lw s3, 0(t5) + ld a1, 0(a3) + lw s6, 16(t5) + lw s5, 24(t5) + or a1, s2, a1 + mv s0, s4 + beqz a1, .LBB2_10 +# %bb.7: # %after_bb + ld a1, 0(a5) + ld s8, 16(a0) + or a1, a1, s0 + beqz a1, .LBB2_11 +.LBB2_8: # %after_bb1 + beqz s8, .LBB2_12 +.LBB2_9: + li a0, 0 + j .LBB2_18 +.LBB2_10: # %then_bb + lui a1, 128 + addiw a1, a1, 9 + slli a1, a1, 13 + sd a1, 32(a3) + ld a1, 40(a3) + sd zero, 0(a3) + sd zero, 8(a3) + sd zero, 16(a3) + sw zero, 0(a1) + li a2, 300 + sw a2, 4(a1) + li a2, 1 + sw a2, 8(a1) + sw zero, 12(a1) + ld s0, 16(a5) + sd zero, 24(a3) + ld a1, 0(a5) + ld s8, 16(a0) + or a1, a1, s0 + bnez a1, .LBB2_8 +.LBB2_11: # %then_bb2 + sd zero, 16(a5) + sd zero, 8(a5) + sd zero, 0(a5) + lui a1, 128 + addiw a1, a1, 9 + ld a2, 40(a5) + slli a1, a1, 13 + addi a1, a1, 2 + sd a1, 32(a5) + sw zero, 0(a2) + li a1, 100 + sw a1, 4(a2) + li a1, 1 + sw a1, 8(a2) + sw zero, 12(a2) + sd zero, 24(a5) + bnez s8, .LBB2_9 +.LBB2_12: # %_halide_buffer_is_bounds_query.exit31 + ld a1, 0(a0) + bnez a1, .LBB2_17 +# %bb.13: # %then_bb5 + srai a1, a6, 31 + subw a2, a6, a1 + mul s1, a2, t1 + srli s1, s1, 32 + add a2, a2, s1 + srliw s1, a2, 31 + sraiw a2, a2, 8 + addw a2, a2, s1 + add a1, a1, a2 + sgtz a2, a1 + negw a2, a2 + and a1, a1, a2 + mulw a1, a1, a7 + subw a1, a1, t0 + add t0, t0, s3 + addiw t0, t0, -480 + addiw a6, a1, 960 + add a7, t4, s6 + bgtz a4, .LBB2_15 +# %bb.14: # %then_bb5 + li s0, 270 + j .LBB2_16 +.LBB2_15: + addi s0, t3, 270 +.LBB2_16: # %then_bb5 + subw a1, s0, t4 + sd zero, 16(a0) + sd zero, 8(a0) + sd zero, 0(a0) + lui a2, 256 + addiw a2, a2, 9 + slli a2, a2, 13 + addi a2, a2, 2 + sd a2, 32(a0) + sw t0, 0(t5) + sw a6, 4(t5) + li a2, 1 + sw a2, 8(t5) + sw zero, 12(t5) + sw a7, 16(t5) + sw a1, 20(t5) + sw a6, 24(t5) + sw zero, 28(t5) + sd zero, 24(a0) +.LBB2_17: # %land.rhs.i44 + ld a0, 0(a0) + seqz a0, a0 +.LBB2_18: # %_halide_buffer_is_bounds_query.exit45 + ld a1, 16(a3) + ld a2, 0(a3) + ld a3, 16(a5) + ld a5, 0(a5) + or a1, a1, a2 + seqz a1, a1 + or a3, a3, a5 + seqz a2, a3 + or a1, a1, a2 + or a0, a0, a1 + bnez a0, .LBB2_20 +# %bb.19: # %then_bb8 + mulw a0, s6, s5 + negw s0, a0 + sd s8, 8(sp) + sw a4, 16(sp) + sw t2, 20(sp) + sw s6, 24(sp) + sw s5, 28(sp) + sw s9, 32(sp) + sw s0, 36(sp) +.Lpcrel_hi2: + auipc a0, %got_pcrel_hi(idw_halide_parallel__par_for_idw_halide_parallel__s0_x_x_outer_tile_index__1) + ld a1, %pcrel_lo(.Lpcrel_hi2)(a0) + addi a4, sp, 8 + li a0, 0 + li a2, 0 + mv a3, s7 + call halide_do_par_for@plt + sd s2, 40(sp) + sd s4, 48(sp) + sd s8, 56(sp) + sw s3, 64(sp) + sw s6, 68(sp) + sw s5, 72(sp) + sw s9, 76(sp) + sw s0, 80(sp) +.Lpcrel_hi3: + auipc a0, %got_pcrel_hi(idw_halide_parallel__par_for_idw_halide_parallel__s1_x_x_outer_tile_index__1) + ld a1, %pcrel_lo(.Lpcrel_hi3)(a0) + addi a4, sp, 40 + li a0, 0 + li a2, 0 + mv a3, s7 + call halide_do_par_for@plt +.LBB2_20: # %destructor_block + li a0, 0 + ld ra, 168(sp) # 8-byte Folded Reload + ld s0, 160(sp) # 8-byte Folded Reload + ld s1, 152(sp) # 8-byte Folded Reload + ld s2, 144(sp) # 8-byte Folded Reload + ld s3, 136(sp) # 8-byte Folded Reload + ld s4, 128(sp) # 8-byte Folded Reload + ld s5, 120(sp) # 8-byte Folded Reload + ld s6, 112(sp) # 8-byte Folded Reload + ld s7, 104(sp) # 8-byte Folded Reload + ld s8, 96(sp) # 8-byte Folded Reload + ld s9, 88(sp) # 8-byte Folded Reload + addi sp, sp, 176 + ret +.Lfunc_end2: + .size idw_halide_parallel_, .Lfunc_end2-idw_halide_parallel_ + # -- End function + .section .text.idw_halide_parallel__argv,"ax",@progbits + .globl idw_halide_parallel__argv # -- Begin function idw_halide_parallel__argv + .p2align 1 + .type idw_halide_parallel__argv,@function +idw_halide_parallel__argv: # @idw_halide_parallel__argv +# %bb.0: # %entry + addi sp, sp, -16 + sd ra, 8(sp) # 8-byte Folded Spill + ld a0, 0(a0) + call idw_halide_parallel_@plt + li a0, 0 + ld ra, 8(sp) # 8-byte Folded Reload + addi sp, sp, 16 + ret +.Lfunc_end3: + .size idw_halide_parallel__argv, .Lfunc_end3-idw_halide_parallel__argv + # -- End function + .section .text.idw_halide_parallel__metadata,"ax",@progbits + .globl idw_halide_parallel__metadata # -- Begin function idw_halide_parallel__metadata + .p2align 1 + .type idw_halide_parallel__metadata,@function +idw_halide_parallel__metadata: # @idw_halide_parallel__metadata +# %bb.0: # %entry +.Lpcrel_hi4: + auipc a0, %pcrel_hi(.Lidw_halide_parallel__metadata_storage) + addi a0, a0, %pcrel_lo(.Lpcrel_hi4) + ret +.Lfunc_end4: + .size idw_halide_parallel__metadata, .Lfunc_end4-idw_halide_parallel__metadata + # -- End function + .type .Lb24.shape,@object # @b24.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb24.shape: + .asciz "\000\000\000\000,\001\000\000\001\000\000\000\000\000\000" + .size .Lb24.shape, 16 + + .type .Lb24.data,@object # @b24.data + .p2align 5, 0x0 +.Lb24.data: + .asciz "\000\000\000\000\000\000\000\000H\000\000\000\000\000\000\000\325\000\000\000O\000\000\000\000\000\000\000\252\001\000\000<\000\000\000\000\000\000\000\200\002\000\000L\000\000\000\000\000\000\000U\003\000\000\200\000\000\000\000\000\000\000*\004\000\000C\000\000\000\000\000\000\000\000\005\000\000A\000\000\000\000\000\000\000\325\005\000\000@\000\000\000\000\000\000\000\252\006\000\000<\000\000\000\000\000\000\000\200\007\000\000=\000\000\000x\000\000\000\000\000\000\000Q\000\000\000x\000\000\000\325\000\000\000O\000\000\000x\000\000\000\252\001\000\000:\000\000\000x\000\000\000\200\002\000\000\204\000\000\000x\000\000\000U\003\000\000\225\000\000\000x\000\000\000*\004\000\000\216\000\000\000x\000\000\000\000\005\000\000@\000\000\000x\000\000\000\325\005\000\000E\000\000\000x\000\000\000\252\006\000\000A\000\000\000x\000\000\000\200\007\000\000@\000\000\000\360\000\000\000\000\000\000\000K\000\000\000\360\000\000\000\325\000\000\000D\000\000\000\360\000\000\000\252\001\000\000\214\000\000\000\360\000\000\000\200\002\000\000\231\000\000\000\360\000\000\000U\003\000\000\221\000\000\000\360\000\000\000*\004\000\000\204\000\000\000\360\000\000\000\000\005\000\000\230\000\000\000\360\000\000\000\325\005\000\000}\000\000\000\360\000\000\000\252\006\000\000B\000\000\000\360\000\000\000\200\007\000\000:\000\000\000h\001\000\000\000\000\000\000N\000\000\000h\001\000\000\325\000\000\000<\000\000\000h\001\000\000\252\001\000\000\213\000\000\000h\001\000\000\200\002\000\000\250\000\000\000h\001\000\000U\003\000\000\232\000\000\000h\001\000\000*\004\000\000\212\000\000\000h\001\000\000\000\005\000\000\221\000\000\000h\001\000\000\325\005\000\000\240\000\000\000h\001\000\000\252\006\000\000D\000\000\000h\001\000\000\200\007\000\000<\000\000\000\340\001\000\000\000\000\000\000M\000\000\000\340\001\000\000\325\000\000\000;\000\000\000\340\001\000\000\252\001\000\000\245\000\000\000\340\001\000\000\200\002\000\000\267\000\000\000\340\001\000\000U\003\000\000\246\000\000\000\340\001\000\000*\004\000\000\216\000\000\000\340\001\000\000\000\005\000\000{\000\000\000\340\001\000\000\325\005\000\000\233\000\000\000\340\001\000\000\252\006\000\000\220\000\000\000\340\001\000\000\200\007\000\000<\000\000\000X\002\000\000\000\000\000\000S\000\000\000X\002\000\000\325\000\000\000A\000\000\000X\002\000\000\252\001\000\000\262\000\000\000X\002\000\000\200\002\000\000\270\000\000\000X\002\000\000U\003\000\000\212\000\000\000X\002\000\000*\004\000\000|\000\000\000X\002\000\000\000\005\000\000\204\000\000\000X\002\000\000\325\005\000\000\257\000\000\000X\002\000\000\252\006\000\000\257\000\000\000X\002\000\000\200\007\000\000<\000\000\000\320\002\000\000\000\000\000\000V\000\000\000\320\002\000\000\325\000\000\000<\000\000\000\320\002\000\000\252\001\000\000\266\000\000\000\320\002\000\000\200\002\000\000\263\000\000\000\320\002\000\000U\003\000\000\230\000\000\000\320\002\000\000*\004\000\000\214\000\000\000\320\002\000\000\000\005\000\000s\000\000\000\320\002\000\000\325\005\000\000\234\000\000\000\320\002\000\000\252\006\000\000\254\000\000\000\320\002\000\000\200\007\000\000A\000\000\000H\003\000\000\000\000\000\000Z\000\000\000H\003\000\000\325\000\000\000E\000\000\000H\003\000\000\252\001\000\000\244\000\000\000H\003\000\000\200\002\000\000\272\000\000\000H\003\000\000U\003\000\000\222\000\000\000H\003\000\000*\004\000\000\223\000\000\000H\003\000\000\000\005\000\000z\000\000\000H\003\000\000\325\005\000\000\230\000\000\000H\003\000\000\252\006\000\000\246\000\000\000H\003\000\000\200\007\000\000E\000\000\000\300\003\000\000\000\000\000\000U\000\000\000\300\003\000\000\325\000\000\000C\000\000\000\300\003\000\000\252\001\000\000\272\000\000\000\300\003\000\000\200\002\000\000\251\000\000\000\300\003\000\000U\003\000\000\242\000\000\000\300\003\000\000*\004\000\000\223\000\000\000\300\003\000\000\000\005\000\000|\000\000\000\300\003\000\000\325\005\000\000\225\000\000\000\300\003\000\000\252\006\000\000\235\000\000\000\300\003\000\000\200\007\000\000E\000\000\0008\004\000\000\000\000\000\000U\000\000\0008\004\000\000\325\000\000\000F\000\000\0008\004\000\000\252\001\000\000\301\000\000\0008\004\000\000\200\002\000\000\300\000\000\0008\004\000\000U\003\000\000\233\000\000\0008\004\000\000*\004\000\000\207\000\000\0008\004\000\000\000\005\000\000x\000\000\0008\004\000\000\325\005\000\000\212\000\000\0008\004\000\000\252\006\000\000\200\000\000\0008\004\000\000\200\007\000\000F\000\000" + .size .Lb24.data, 1200 + + .type .Lb24.buffer,@object # @b24.buffer + .data + .p2align 4, 0x0 +.Lb24.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb24.data + .quad 1 # 0x1 + .byte 0 # 0x0 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb24.shape + .quad 0 + .size .Lb24.buffer, 56 + + .type .Lb25.shape,@object # @b25.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb25.shape: + .asciz "\000\000\000\000d\000\000\000\001\000\000\000\000\000\000" + .size .Lb25.shape, 16 + + .type .Lb25.data,@object # @b25.data + .p2align 5, 0x0 +.Lb25.data: + .ascii "\211\004&=\244\022\233\275\217U%;\272)J>#\026\024\276@_\234>\340\243|\275\334\034K\275\232\300\004:A\313\245\274\303\024r\275\345\313a\275x\231\300>\201\334\"\276\236}\321\275H\346\257\276\202V\310>\225\375(>},\026\274E\235W\275\241\236\236<\245\220\222=\004\361\263\276\2620\240\274\265\036\240=\\:\336=\240\016\263\276\260\224\371\275p\235(=\311\\x;\226y\\\275\360\227\377=\275\364\200=rw/<@U\017=c\327\003=nV\267\273\3543\207\311\274<&{\274h\b\031>\203\222\003\276\213\3649\275d\177\017\276\330\306\246\275'\314:>\203\024\253=S\331M\276\257\263S=b\221q\275\336\t\306=#^\005\2763)\256\274\344\255G>\224!\374=\374\360o\275\215#\031\276\322`J\276y6\360=\273\303;\275N\"N>M\022K\276\016\035C=1Fb\275u\017\220\275\0040\033>,\037\223=t\335\304\275\363\344\271=\320\341\260\275\2265\255=\310\"b=&\031\330\275;\314\000>>\347o\275\255\222\020=C\rU<\370d\307\275\373\306\206=I\345\373\274Y|\034>WY;\276O\240?>B\347\226\275\203\271\203\275W\243\313<\214\364\343\274Ey\033\276J4\214=\261\304\241\274-\271p>X\313l\276*_\366\275\260^\\=\237l1=\371\272\200=\002\373?\274*\3578<]Gv=" + .size .Lb25.data, 400 + + .type .Lb25.buffer,@object # @b25.buffer + .data + .p2align 4, 0x0 +.Lb25.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb25.data + .quad 1 # 0x1 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb25.shape + .quad 0 + .size .Lb25.buffer, 56 + + .type .L__unnamed_1,@object # @0 + .section .rodata,"a",@progbits + .p2align 4, 0x0 +.L__unnamed_1: + .zero 32 + .size .L__unnamed_1, 32 + + .type .Lstr,@object # @str + .p2align 5, 0x0 +.Lstr: + .asciz "idw_halide_parallel_" + .size .Lstr, 21 + + .type .L__unnamed_2,@object # @1 + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.L__unnamed_2: + .quad .Lstr + .word 2 # 0x2 + .word 2 # 0x2 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .zero 4 + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .quad .L__unnamed_1 + .size .L__unnamed_2, 64 + + .type .Lstr.4,@object # @str.4 + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lstr.4: + .asciz "riscv-64-linux-no_asserts-no_runtime-rvv-vector_bits_128" + .size .Lstr.4, 57 + + .type .Lidw_halide_parallel__metadata_storage,@object # @idw_halide_parallel__metadata_storage + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.Lidw_halide_parallel__metadata_storage: + .word 1 # 0x1 + .word 1 # 0x1 + .quad .L__unnamed_2 + .quad .Lstr.4 + .quad .Lstr + .size .Lidw_halide_parallel__metadata_storage, 32 + + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .section ".note.GNU-stack","",@progbits diff --git a/aot/idw_halide_parallel_vec.h b/aot/idw_halide_parallel_vec.h new file mode 100644 index 0000000..156229d --- /dev/null +++ b/aot/idw_halide_parallel_vec.h @@ -0,0 +1,56 @@ +#ifndef HALIDE__idw_halide_parallel_vec_h +#define HALIDE__idw_halide_parallel_vec_h +#include + +// Forward declarations of the types used in the interface +// to the Halide pipeline. +// +// For the definitions of these structs, include HalideRuntime.h + +// Halide's representation of a multi-dimensional array. +// Halide::Runtime::Buffer is a more user-friendly wrapper +// around this. Its declaration is in HalideBuffer.h +struct halide_buffer_t; + +// Metadata describing the arguments to the generated function. +// Used to construct calls to the _argv version of the function. +struct halide_filter_metadata_t; + +#ifndef HALIDE_MUST_USE_RESULT +#ifdef __has_attribute +#if __has_attribute(nodiscard) +#define HALIDE_MUST_USE_RESULT [[nodiscard]] +#elif __has_attribute(warn_unused_result) +#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result)) +#else +#define HALIDE_MUST_USE_RESULT +#endif +#else +#define HALIDE_MUST_USE_RESULT +#endif +#endif + +#ifndef HALIDE_FUNCTION_ATTRS +#define HALIDE_FUNCTION_ATTRS +#endif + + + +#ifdef __cplusplus +extern "C" { +#endif + +HALIDE_FUNCTION_ATTRS +int idw_halide_parallel_vec_(struct halide_buffer_t *_idw_halide_parallel_vec__buffer); + +HALIDE_FUNCTION_ATTRS +int idw_halide_parallel_vec__argv(void **args); + +HALIDE_FUNCTION_ATTRS +const struct halide_filter_metadata_t *idw_halide_parallel_vec__metadata(); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/aot/idw_halide_parallel_vec.s b/aot/idw_halide_parallel_vec.s new file mode 100644 index 0000000..484d582 --- /dev/null +++ b/aot/idw_halide_parallel_vec.s @@ -0,0 +1,739 @@ + .text + .attribute 4, 16 + .attribute 5, "rv64gcv0p7" + .file "halide_buffer_t.cpp" + .section .text.idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1,"ax",@progbits + .globl idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1 # -- Begin function idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1 + .p2align 1 + .type idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1,@function +idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1: # @idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1 +# %bb.0: # %entry + addi sp, sp, -48 + sd ra, 40(sp) # 8-byte Folded Spill + sd s0, 32(sp) # 8-byte Folded Spill + sd s1, 24(sp) # 8-byte Folded Spill + sd s2, 16(sp) # 8-byte Folded Spill + sd s3, 8(sp) # 8-byte Folded Spill + lw a4, 24(a2) + lw a7, 8(a2) + lw a6, 12(a2) + sraiw a3, a1, 31 + seqz a5, a4 + negw a0, a5 + srai s1, a4, 31 + subw t0, a1, a3 + or a0, a0, a4 + remw a1, t0, a0 + xor s0, s1, a4 + not a0, s1 + add s0, s0, a0 + and s0, s0, a3 + add a1, a1, s0 + li s0, 480 + mulw a1, a1, s0 + addi s0, a5, -1 + and t2, s0, a1 + addiw a1, a7, -480 + blt t2, a1, .LBB0_2 +# %bb.1: # %entry + mv t2, a1 +.LBB0_2: # %entry + lw a7, 16(a2) + lw s2, 20(a2) + lw t1, 28(a2) + add a4, a4, a5 + divw a1, t0, a4 + subw a0, a0, s1 + and a0, a0, a3 + add a0, a0, a1 + li s1, 270 + mulw a0, a0, s1 + and a0, a0, s0 + addiw a1, a6, -270 + blt a0, a1, .LBB0_4 +# %bb.3: # %entry + mv a0, a1 +.LBB0_4: # %entry + ld s3, 0(a2) + add a0, a0, a7 + add t1, t1, t2 + mulw a0, s2, a0 + addw s0, t1, a0 +.LBB0_5: # %"for idw_halide_parallel_vec_.s0.y.y_inner" + # =>This Inner Loop Header: Depth=1 + slli a0, s0, 2 + add a0, a0, s3 + li a2, 1920 + li a1, 0 + call memset@plt + addi s1, s1, -1 + addw s0, s0, s2 + bnez s1, .LBB0_5 +# %bb.6: # %destructor_block + li a0, 0 + ld ra, 40(sp) # 8-byte Folded Reload + ld s0, 32(sp) # 8-byte Folded Reload + ld s1, 24(sp) # 8-byte Folded Reload + ld s2, 16(sp) # 8-byte Folded Reload + ld s3, 8(sp) # 8-byte Folded Reload + addi sp, sp, 48 + ret +.Lfunc_end0: + .size idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1, .Lfunc_end0-idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1 + # -- End function + .section .text.idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1,"ax",@progbits + .globl idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1 # -- Begin function idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1 + .p2align 1 + .type idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1,@function +idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1: # @idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1 +# %bb.0: # %entry + addi sp, sp, -208 + sd ra, 200(sp) # 8-byte Folded Spill + sd s0, 192(sp) # 8-byte Folded Spill + sd s1, 184(sp) # 8-byte Folded Spill + sd s2, 176(sp) # 8-byte Folded Spill + sd s3, 168(sp) # 8-byte Folded Spill + sd s4, 160(sp) # 8-byte Folded Spill + sd s5, 152(sp) # 8-byte Folded Spill + sd s6, 144(sp) # 8-byte Folded Spill + sd s7, 136(sp) # 8-byte Folded Spill + sd s8, 128(sp) # 8-byte Folded Spill + sd s9, 120(sp) # 8-byte Folded Spill + sd s10, 112(sp) # 8-byte Folded Spill + sd s11, 104(sp) # 8-byte Folded Spill + li a3, 0 + ld t4, 0(a2) + ld t0, 8(a2) + ld a0, 16(a2) + sd a0, 48(sp) # 8-byte Folded Spill + lw a0, 24(a2) + sd a0, 40(sp) # 8-byte Folded Spill + lw a7, 28(a2) + lw a4, 36(a2) + lw a0, 32(a2) + sd a0, 8(sp) # 8-byte Folded Spill + lw a0, 40(a2) + sd a0, 32(sp) # 8-byte Folded Spill + sraiw a2, a1, 31 + seqz a5, a4 + negw s1, a5 + srai s0, a4, 31 + subw a1, a1, a2 + or s1, s1, a4 + remw t1, a1, s1 + xor a0, s0, a4 + not s1, s0 + add a0, a0, s1 + and a0, a0, a2 + add a0, a0, t1 + add a4, a4, a5 + divw a1, a1, a4 + subw s1, s1, s0 + and a2, a2, s1 + add a1, a1, a2 + li a2, 480 + mulw a0, a0, a2 + addi a5, a5, -1 + and a0, a0, a5 + sd a0, 24(sp) # 8-byte Folded Spill + li a0, 270 + mulw a0, a1, a0 + and a0, a0, a5 + add a0, a0, a7 + sd a0, 0(sp) # 8-byte Folded Spill + li s2, 48 + li a0, 6 + li a2, 1 + addi t5, sp, 80 + addi s3, sp, 92 + addi s4, sp, 88 + addi s5, sp, 84 + li s0, 4 + addi t6, sp, 64 + addi s6, sp, 68 + addi s7, sp, 76 + addi s8, sp, 72 + li s9, 2 + li s10, 3 + li ra, 25 +.LBB1_1: # %"for idw_halide_parallel_vec_.s1.y.y_inner" + # =>This Loop Header: Depth=1 + # Child Loop BB1_2 Depth 2 + # Child Loop BB1_3 Depth 3 + # Child Loop BB1_4 Depth 4 + # Child Loop BB1_9 Depth 5 + li s11, 0 + sd a3, 16(sp) # 8-byte Folded Spill + ld a1, 0(sp) # 8-byte Folded Reload + addw t1, a1, a3 + ld a1, 8(sp) # 8-byte Folded Reload + mulw a1, t1, a1 + sd a1, 56(sp) # 8-byte Folded Spill +.LBB1_2: # %"for idw_halide_parallel_vec_.s1.x.x_inner" + # Parent Loop BB1_1 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB1_3 Depth 3 + # Child Loop BB1_4 Depth 4 + # Child Loop BB1_9 Depth 5 + li a7, 0 + ld t2, 24(sp) # 8-byte Folded Reload + add t2, t2, s11 + ld a4, 32(sp) # 8-byte Folded Reload + add a4, a4, t2 + ld a1, 56(sp) # 8-byte Folded Reload + addw a4, a4, a1 + ld a1, 40(sp) # 8-byte Folded Reload + addw t2, t2, a1 + slli a4, a4, 2 + ld t3, 48(sp) # 8-byte Folded Reload + add t3, t3, a4 +.LBB1_3: # %"for idw_halide_parallel_vec_.s1.r52$x.r52$x" + # Parent Loop BB1_1 Depth=1 + # Parent Loop BB1_2 Depth=2 + # => This Loop Header: Depth=3 + # Child Loop BB1_4 Depth 4 + # Child Loop BB1_9 Depth 5 + mul a4, a7, s2 + add a4, a4, t4 + addi a6, a4, 4 + vsetvli zero, a0, e32, m2 + vlwu.v v8, (a6) + vsetvli zero, a2, e32, m2 + vslidedown.vi v10, v8, 3 + addi a3, a4, 20 + vsetvli zero, a0, e32, m2 + vlwu.v v12, (a3) + vsetvli zero, a2, e32, m2 + vslidedown.vi v14, v12, 2 + vslidedown.vi v12, v12, 5 + vsw.v v8, (t5) + vsw.v v12, (s3) + vsw.v v14, (s4) + vsw.v v10, (s5) + vsetvli zero, s0, e32, m1 + vlwu.v v8, (t5) + vsetvli a3, zero, e32, m1 + vrsub.vx v8, v8, t2 + vsetvli zero, a0, e32, m2 + vlwu.v v10, (a4) + vsetvli zero, a2, e32, m2 + vslidedown.vi v12, v10, 3 + addi a4, a4, 16 + vsetvli zero, a0, e32, m2 + vlwu.v v14, (a4) + vsetvli zero, a2, e32, m2 + vslidedown.vi v16, v14, 2 + vslidedown.vi v14, v14, 5 + vsw.v v10, (t6) + vsw.v v12, (s6) + vsw.v v14, (s7) + vsw.v v16, (s8) + vsetvli zero, s0, e32, m1 + vlwu.v v9, (t6) + vsetvli a3, zero, e32, m1 + lw a3, 0(t3) + vrsub.vx v9, v9, t1 + fmv.w.x ft0, a3 + vmul.vv v9, v9, v9 + vmacc.vv v9, v8, v8 + vfcvt.f.x.v v9, v9 + vfmv.f.s ft1, v9 + fsqrt.s ft1, ft1 + vfmv.s.f v8, ft1 + vsetvli zero, a2, e32, m1 + vslidedown.vi v10, v9, 1 + vfmv.f.s ft1, v10 + fsqrt.s ft1, ft1 + vsetvli a3, zero, e32, m1 + vfmv.s.f v10, ft1 + vsetvli zero, s9, e32, m1 + vslideup.vi v8, v10, 1 + vsetvli zero, a2, e32, m1 + vslidedown.vi v10, v9, 2 + vfmv.f.s ft1, v10 + fsqrt.s ft1, ft1 + vsetvli a3, zero, e32, m1 + vfmv.s.f v10, ft1 + vsetvli zero, s10, e32, m1 + vslideup.vi v8, v10, 2 + vsetvli zero, a2, e32, m1 + vslidedown.vi v9, v9, 3 + vfmv.f.s ft1, v9 + fsqrt.s ft1, ft1 + vsetvli a3, zero, e32, m1 + vfmv.s.f v9, ft1 + vsetvli zero, s0, e32, m1 + vslideup.vi v8, v9, 3 + slli a4, a7, 4 + add a4, a4, t0 +.LBB1_4: # %casloop.start + # Parent Loop BB1_1 Depth=1 + # Parent Loop BB1_2 Depth=2 + # Parent Loop BB1_3 Depth=3 + # => This Loop Header: Depth=4 + # Child Loop BB1_9 Depth 5 + flw ft1, 0(t3) + vsetvli zero, zero, e32, m1 + vlwu.v v9, (a4) + vfmul.vv v9, v8, v9 + vfmv.s.f v10, ft1 + vfredsum.vs v9, v9, v10 + vfmv.f.s ft1, v9 + fmv.x.w a3, ft1 + fmv.x.w a1, ft0 +.LBB1_9: # %casloop.start + # Parent Loop BB1_1 Depth=1 + # Parent Loop BB1_2 Depth=2 + # Parent Loop BB1_3 Depth=3 + # Parent Loop BB1_4 Depth=4 + # => This Inner Loop Header: Depth=5 + lr.w s1, (t3) + bne s1, a1, .LBB1_11 +# %bb.10: # %casloop.start + # in Loop: Header=BB1_9 Depth=5 + sc.w a5, a3, (t3) + bnez a5, .LBB1_9 +.LBB1_11: # %casloop.start + # in Loop: Header=BB1_4 Depth=4 + fmv.w.x ft0, s1 + bne s1, a1, .LBB1_4 +# %bb.5: # %casloop.end + # in Loop: Header=BB1_3 Depth=3 + addi a7, a7, 1 + bne a7, ra, .LBB1_3 +# %bb.6: # %"end for idw_halide_parallel_vec_.s1.r52$x.r52$x" + # in Loop: Header=BB1_2 Depth=2 + addi s11, s11, 1 + li a1, 480 + bne s11, a1, .LBB1_2 +# %bb.7: # %"end for idw_halide_parallel_vec_.s1.x.x_inner" + # in Loop: Header=BB1_1 Depth=1 + ld a3, 16(sp) # 8-byte Folded Reload + addiw a3, a3, 1 + li a1, 270 + bne a3, a1, .LBB1_1 +# %bb.8: # %destructor_block + li a0, 0 + ld ra, 200(sp) # 8-byte Folded Reload + ld s0, 192(sp) # 8-byte Folded Reload + ld s1, 184(sp) # 8-byte Folded Reload + ld s2, 176(sp) # 8-byte Folded Reload + ld s3, 168(sp) # 8-byte Folded Reload + ld s4, 160(sp) # 8-byte Folded Reload + ld s5, 152(sp) # 8-byte Folded Reload + ld s6, 144(sp) # 8-byte Folded Reload + ld s7, 136(sp) # 8-byte Folded Reload + ld s8, 128(sp) # 8-byte Folded Reload + ld s9, 120(sp) # 8-byte Folded Reload + ld s10, 112(sp) # 8-byte Folded Reload + ld s11, 104(sp) # 8-byte Folded Reload + addi sp, sp, 208 + ret +.Lfunc_end1: + .size idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1, .Lfunc_end1-idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1 + # -- End function + .section .text.idw_halide_parallel_vec_,"ax",@progbits + .globl idw_halide_parallel_vec_ # -- Begin function idw_halide_parallel_vec_ + .p2align 1 + .type idw_halide_parallel_vec_,@function +idw_halide_parallel_vec_: # @idw_halide_parallel_vec_ +# %bb.0: # %entry + addi sp, sp, -176 + sd ra, 168(sp) # 8-byte Folded Spill + sd s0, 160(sp) # 8-byte Folded Spill + sd s1, 152(sp) # 8-byte Folded Spill + sd s2, 144(sp) # 8-byte Folded Spill + sd s3, 136(sp) # 8-byte Folded Spill + sd s4, 128(sp) # 8-byte Folded Spill + sd s5, 120(sp) # 8-byte Folded Spill + sd s6, 112(sp) # 8-byte Folded Spill + sd s7, 104(sp) # 8-byte Folded Spill + sd s8, 96(sp) # 8-byte Folded Spill + sd s9, 88(sp) # 8-byte Folded Spill + ld t5, 40(a0) + lw a4, 4(t5) + lw t2, 20(t5) + li a2, -480 + subw a6, a2, a4 + addiw a2, a4, -1 + blt a2, a6, .LBB2_2 +# %bb.1: # %entry + mv a6, a2 +.LBB2_2: # %entry +.Lpcrel_hi0: + auipc a2, %pcrel_hi(.Lb34.buffer) + addi a3, a2, %pcrel_lo(.Lpcrel_hi0) + sgtz t4, a4 + slti a5, a4, -479 + slti s0, t2, -269 + lui s1, 559241 + li a7, 480 + addiw t1, s1, -1911 + mv t0, a4 + blt a4, a7, .LBB2_4 +# %bb.3: # %entry + li t0, 480 +.LBB2_4: # %entry +.Lpcrel_hi1: + auipc t6, %pcrel_hi(.Lb35.buffer) + ld s2, 16(a3) + add a1, a4, a5 + addiw a1, a1, 479 + mul a2, a1, t1 + srli a2, a2, 32 + add a1, a1, a2 + srliw a2, a1, 31 + sraiw a1, a1, 8 + add a1, a1, a2 + subw s9, a1, a5 + add a1, t2, s0 + addiw a1, a1, 269 + lui a2, 994205 + addiw a2, a2, 1609 + mul a2, a1, a2 + srli a2, a2, 32 + add a1, a1, a2 + srliw a2, a1, 31 + sraiw a1, a1, 8 + add a1, a1, a2 + subw a1, a1, s0 + mulw s7, a1, s9 + slti a1, s7, 1 + negw a2, a1 + seqz s1, s9 + add a1, a1, s7 + addiw a1, a1, -1 + add s0, s9, s1 + divw a1, a1, s0 + add a5, a5, a5 + addiw a5, a5, -1 + and a2, a2, a5 + add a1, a1, a2 + li a2, 270 + mulw a1, a1, a2 + addi s1, s1, -1 + and t3, s1, a1 + addi t4, t4, -1 + and t4, t4, t3 + addiw a2, t2, -270 + addi a5, t6, %pcrel_lo(.Lpcrel_hi1) + blt t4, a2, .LBB2_6 +# %bb.5: # %entry + mv t4, a2 +.LBB2_6: # %entry + ld s4, 16(a5) + lw s3, 0(t5) + ld a1, 0(a3) + lw s6, 16(t5) + lw s5, 24(t5) + or a1, s2, a1 + mv s0, s4 + beqz a1, .LBB2_10 +# %bb.7: # %after_bb + ld a1, 0(a5) + ld s8, 16(a0) + or a1, a1, s0 + beqz a1, .LBB2_11 +.LBB2_8: # %after_bb1 + beqz s8, .LBB2_12 +.LBB2_9: + li a0, 0 + j .LBB2_18 +.LBB2_10: # %then_bb + lui a1, 128 + addiw a1, a1, 9 + slli a1, a1, 13 + sd a1, 32(a3) + ld a1, 40(a3) + sd zero, 0(a3) + sd zero, 8(a3) + sd zero, 16(a3) + sw zero, 0(a1) + li a2, 300 + sw a2, 4(a1) + li a2, 1 + sw a2, 8(a1) + sw zero, 12(a1) + ld s0, 16(a5) + sd zero, 24(a3) + ld a1, 0(a5) + ld s8, 16(a0) + or a1, a1, s0 + bnez a1, .LBB2_8 +.LBB2_11: # %then_bb2 + sd zero, 16(a5) + sd zero, 8(a5) + sd zero, 0(a5) + lui a1, 128 + addiw a1, a1, 9 + ld a2, 40(a5) + slli a1, a1, 13 + addi a1, a1, 2 + sd a1, 32(a5) + sw zero, 0(a2) + li a1, 100 + sw a1, 4(a2) + li a1, 1 + sw a1, 8(a2) + sw zero, 12(a2) + sd zero, 24(a5) + bnez s8, .LBB2_9 +.LBB2_12: # %_halide_buffer_is_bounds_query.exit31 + ld a1, 0(a0) + bnez a1, .LBB2_17 +# %bb.13: # %then_bb5 + srai a1, a6, 31 + subw a2, a6, a1 + mul s1, a2, t1 + srli s1, s1, 32 + add a2, a2, s1 + srliw s1, a2, 31 + sraiw a2, a2, 8 + addw a2, a2, s1 + add a1, a1, a2 + sgtz a2, a1 + negw a2, a2 + and a1, a1, a2 + mulw a1, a1, a7 + subw a1, a1, t0 + add t0, t0, s3 + addiw t0, t0, -480 + addiw a6, a1, 960 + add a7, t4, s6 + bgtz a4, .LBB2_15 +# %bb.14: # %then_bb5 + li s0, 270 + j .LBB2_16 +.LBB2_15: + addi s0, t3, 270 +.LBB2_16: # %then_bb5 + subw a1, s0, t4 + sd zero, 16(a0) + sd zero, 8(a0) + sd zero, 0(a0) + lui a2, 256 + addiw a2, a2, 9 + slli a2, a2, 13 + addi a2, a2, 2 + sd a2, 32(a0) + sw t0, 0(t5) + sw a6, 4(t5) + li a2, 1 + sw a2, 8(t5) + sw zero, 12(t5) + sw a7, 16(t5) + sw a1, 20(t5) + sw a6, 24(t5) + sw zero, 28(t5) + sd zero, 24(a0) +.LBB2_17: # %land.rhs.i44 + ld a0, 0(a0) + seqz a0, a0 +.LBB2_18: # %_halide_buffer_is_bounds_query.exit45 + ld a1, 16(a3) + ld a2, 0(a3) + ld a3, 16(a5) + ld a5, 0(a5) + or a1, a1, a2 + seqz a1, a1 + or a3, a3, a5 + seqz a2, a3 + or a1, a1, a2 + or a0, a0, a1 + bnez a0, .LBB2_20 +# %bb.19: # %then_bb8 + mulw a0, s6, s5 + negw s0, a0 + sd s8, 8(sp) + sw a4, 16(sp) + sw t2, 20(sp) + sw s6, 24(sp) + sw s5, 28(sp) + sw s9, 32(sp) + sw s0, 36(sp) +.Lpcrel_hi2: + auipc a0, %got_pcrel_hi(idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s0_x_x_outer_tile_index__1) + ld a1, %pcrel_lo(.Lpcrel_hi2)(a0) + addi a4, sp, 8 + li a0, 0 + li a2, 0 + mv a3, s7 + call halide_do_par_for@plt + sd s2, 40(sp) + sd s4, 48(sp) + sd s8, 56(sp) + sw s3, 64(sp) + sw s6, 68(sp) + sw s5, 72(sp) + sw s9, 76(sp) + sw s0, 80(sp) +.Lpcrel_hi3: + auipc a0, %got_pcrel_hi(idw_halide_parallel_vec__par_for_idw_halide_parallel_vec__s1_x_x_outer_tile_index__1) + ld a1, %pcrel_lo(.Lpcrel_hi3)(a0) + addi a4, sp, 40 + li a0, 0 + li a2, 0 + mv a3, s7 + call halide_do_par_for@plt +.LBB2_20: # %destructor_block + li a0, 0 + ld ra, 168(sp) # 8-byte Folded Reload + ld s0, 160(sp) # 8-byte Folded Reload + ld s1, 152(sp) # 8-byte Folded Reload + ld s2, 144(sp) # 8-byte Folded Reload + ld s3, 136(sp) # 8-byte Folded Reload + ld s4, 128(sp) # 8-byte Folded Reload + ld s5, 120(sp) # 8-byte Folded Reload + ld s6, 112(sp) # 8-byte Folded Reload + ld s7, 104(sp) # 8-byte Folded Reload + ld s8, 96(sp) # 8-byte Folded Reload + ld s9, 88(sp) # 8-byte Folded Reload + addi sp, sp, 176 + ret +.Lfunc_end2: + .size idw_halide_parallel_vec_, .Lfunc_end2-idw_halide_parallel_vec_ + # -- End function + .section .text.idw_halide_parallel_vec__argv,"ax",@progbits + .globl idw_halide_parallel_vec__argv # -- Begin function idw_halide_parallel_vec__argv + .p2align 1 + .type idw_halide_parallel_vec__argv,@function +idw_halide_parallel_vec__argv: # @idw_halide_parallel_vec__argv +# %bb.0: # %entry + addi sp, sp, -16 + sd ra, 8(sp) # 8-byte Folded Spill + ld a0, 0(a0) + call idw_halide_parallel_vec_@plt + li a0, 0 + ld ra, 8(sp) # 8-byte Folded Reload + addi sp, sp, 16 + ret +.Lfunc_end3: + .size idw_halide_parallel_vec__argv, .Lfunc_end3-idw_halide_parallel_vec__argv + # -- End function + .section .text.idw_halide_parallel_vec__metadata,"ax",@progbits + .globl idw_halide_parallel_vec__metadata # -- Begin function idw_halide_parallel_vec__metadata + .p2align 1 + .type idw_halide_parallel_vec__metadata,@function +idw_halide_parallel_vec__metadata: # @idw_halide_parallel_vec__metadata +# %bb.0: # %entry +.Lpcrel_hi4: + auipc a0, %pcrel_hi(.Lidw_halide_parallel_vec__metadata_storage) + addi a0, a0, %pcrel_lo(.Lpcrel_hi4) + ret +.Lfunc_end4: + .size idw_halide_parallel_vec__metadata, .Lfunc_end4-idw_halide_parallel_vec__metadata + # -- End function + .type .Lb34.shape,@object # @b34.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb34.shape: + .asciz "\000\000\000\000,\001\000\000\001\000\000\000\000\000\000" + .size .Lb34.shape, 16 + + .type .Lb34.data,@object # @b34.data + .p2align 5, 0x0 +.Lb34.data: + .asciz "\000\000\000\000\000\000\000\000H\000\000\000\000\000\000\000\325\000\000\000O\000\000\000\000\000\000\000\252\001\000\000<\000\000\000\000\000\000\000\200\002\000\000L\000\000\000\000\000\000\000U\003\000\000\200\000\000\000\000\000\000\000*\004\000\000C\000\000\000\000\000\000\000\000\005\000\000A\000\000\000\000\000\000\000\325\005\000\000@\000\000\000\000\000\000\000\252\006\000\000<\000\000\000\000\000\000\000\200\007\000\000=\000\000\000x\000\000\000\000\000\000\000Q\000\000\000x\000\000\000\325\000\000\000O\000\000\000x\000\000\000\252\001\000\000:\000\000\000x\000\000\000\200\002\000\000\204\000\000\000x\000\000\000U\003\000\000\225\000\000\000x\000\000\000*\004\000\000\216\000\000\000x\000\000\000\000\005\000\000@\000\000\000x\000\000\000\325\005\000\000E\000\000\000x\000\000\000\252\006\000\000A\000\000\000x\000\000\000\200\007\000\000@\000\000\000\360\000\000\000\000\000\000\000K\000\000\000\360\000\000\000\325\000\000\000D\000\000\000\360\000\000\000\252\001\000\000\214\000\000\000\360\000\000\000\200\002\000\000\231\000\000\000\360\000\000\000U\003\000\000\221\000\000\000\360\000\000\000*\004\000\000\204\000\000\000\360\000\000\000\000\005\000\000\230\000\000\000\360\000\000\000\325\005\000\000}\000\000\000\360\000\000\000\252\006\000\000B\000\000\000\360\000\000\000\200\007\000\000:\000\000\000h\001\000\000\000\000\000\000N\000\000\000h\001\000\000\325\000\000\000<\000\000\000h\001\000\000\252\001\000\000\213\000\000\000h\001\000\000\200\002\000\000\250\000\000\000h\001\000\000U\003\000\000\232\000\000\000h\001\000\000*\004\000\000\212\000\000\000h\001\000\000\000\005\000\000\221\000\000\000h\001\000\000\325\005\000\000\240\000\000\000h\001\000\000\252\006\000\000D\000\000\000h\001\000\000\200\007\000\000<\000\000\000\340\001\000\000\000\000\000\000M\000\000\000\340\001\000\000\325\000\000\000;\000\000\000\340\001\000\000\252\001\000\000\245\000\000\000\340\001\000\000\200\002\000\000\267\000\000\000\340\001\000\000U\003\000\000\246\000\000\000\340\001\000\000*\004\000\000\216\000\000\000\340\001\000\000\000\005\000\000{\000\000\000\340\001\000\000\325\005\000\000\233\000\000\000\340\001\000\000\252\006\000\000\220\000\000\000\340\001\000\000\200\007\000\000<\000\000\000X\002\000\000\000\000\000\000S\000\000\000X\002\000\000\325\000\000\000A\000\000\000X\002\000\000\252\001\000\000\262\000\000\000X\002\000\000\200\002\000\000\270\000\000\000X\002\000\000U\003\000\000\212\000\000\000X\002\000\000*\004\000\000|\000\000\000X\002\000\000\000\005\000\000\204\000\000\000X\002\000\000\325\005\000\000\257\000\000\000X\002\000\000\252\006\000\000\257\000\000\000X\002\000\000\200\007\000\000<\000\000\000\320\002\000\000\000\000\000\000V\000\000\000\320\002\000\000\325\000\000\000<\000\000\000\320\002\000\000\252\001\000\000\266\000\000\000\320\002\000\000\200\002\000\000\263\000\000\000\320\002\000\000U\003\000\000\230\000\000\000\320\002\000\000*\004\000\000\214\000\000\000\320\002\000\000\000\005\000\000s\000\000\000\320\002\000\000\325\005\000\000\234\000\000\000\320\002\000\000\252\006\000\000\254\000\000\000\320\002\000\000\200\007\000\000A\000\000\000H\003\000\000\000\000\000\000Z\000\000\000H\003\000\000\325\000\000\000E\000\000\000H\003\000\000\252\001\000\000\244\000\000\000H\003\000\000\200\002\000\000\272\000\000\000H\003\000\000U\003\000\000\222\000\000\000H\003\000\000*\004\000\000\223\000\000\000H\003\000\000\000\005\000\000z\000\000\000H\003\000\000\325\005\000\000\230\000\000\000H\003\000\000\252\006\000\000\246\000\000\000H\003\000\000\200\007\000\000E\000\000\000\300\003\000\000\000\000\000\000U\000\000\000\300\003\000\000\325\000\000\000C\000\000\000\300\003\000\000\252\001\000\000\272\000\000\000\300\003\000\000\200\002\000\000\251\000\000\000\300\003\000\000U\003\000\000\242\000\000\000\300\003\000\000*\004\000\000\223\000\000\000\300\003\000\000\000\005\000\000|\000\000\000\300\003\000\000\325\005\000\000\225\000\000\000\300\003\000\000\252\006\000\000\235\000\000\000\300\003\000\000\200\007\000\000E\000\000\0008\004\000\000\000\000\000\000U\000\000\0008\004\000\000\325\000\000\000F\000\000\0008\004\000\000\252\001\000\000\301\000\000\0008\004\000\000\200\002\000\000\300\000\000\0008\004\000\000U\003\000\000\233\000\000\0008\004\000\000*\004\000\000\207\000\000\0008\004\000\000\000\005\000\000x\000\000\0008\004\000\000\325\005\000\000\212\000\000\0008\004\000\000\252\006\000\000\200\000\000\0008\004\000\000\200\007\000\000F\000\000" + .size .Lb34.data, 1200 + + .type .Lb34.buffer,@object # @b34.buffer + .data + .p2align 4, 0x0 +.Lb34.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb34.data + .quad 1 # 0x1 + .byte 0 # 0x0 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb34.shape + .quad 0 + .size .Lb34.buffer, 56 + + .type .Lb35.shape,@object # @b35.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb35.shape: + .asciz "\000\000\000\000d\000\000\000\001\000\000\000\000\000\000" + .size .Lb35.shape, 16 + + .type .Lb35.data,@object # @b35.data + .p2align 5, 0x0 +.Lb35.data: + .ascii "\211\004&=\244\022\233\275\217U%;\272)J>#\026\024\276@_\234>\340\243|\275\334\034K\275\232\300\004:A\313\245\274\303\024r\275\345\313a\275x\231\300>\201\334\"\276\236}\321\275H\346\257\276\202V\310>\225\375(>},\026\274E\235W\275\241\236\236<\245\220\222=\004\361\263\276\2620\240\274\265\036\240=\\:\336=\240\016\263\276\260\224\371\275p\235(=\311\\x;\226y\\\275\360\227\377=\275\364\200=rw/<@U\017=c\327\003=nV\267\273\3543\207\311\274<&{\274h\b\031>\203\222\003\276\213\3649\275d\177\017\276\330\306\246\275'\314:>\203\024\253=S\331M\276\257\263S=b\221q\275\336\t\306=#^\005\2763)\256\274\344\255G>\224!\374=\374\360o\275\215#\031\276\322`J\276y6\360=\273\303;\275N\"N>M\022K\276\016\035C=1Fb\275u\017\220\275\0040\033>,\037\223=t\335\304\275\363\344\271=\320\341\260\275\2265\255=\310\"b=&\031\330\275;\314\000>>\347o\275\255\222\020=C\rU<\370d\307\275\373\306\206=I\345\373\274Y|\034>WY;\276O\240?>B\347\226\275\203\271\203\275W\243\313<\214\364\343\274Ey\033\276J4\214=\261\304\241\274-\271p>X\313l\276*_\366\275\260^\\=\237l1=\371\272\200=\002\373?\274*\3578<]Gv=" + .size .Lb35.data, 400 + + .type .Lb35.buffer,@object # @b35.buffer + .data + .p2align 4, 0x0 +.Lb35.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb35.data + .quad 1 # 0x1 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb35.shape + .quad 0 + .size .Lb35.buffer, 56 + + .type .L__unnamed_1,@object # @0 + .section .rodata,"a",@progbits + .p2align 4, 0x0 +.L__unnamed_1: + .zero 32 + .size .L__unnamed_1, 32 + + .type .Lstr,@object # @str + .p2align 5, 0x0 +.Lstr: + .asciz "idw_halide_parallel_vec_" + .size .Lstr, 25 + + .type .L__unnamed_2,@object # @1 + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.L__unnamed_2: + .quad .Lstr + .word 2 # 0x2 + .word 2 # 0x2 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .zero 4 + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .quad .L__unnamed_1 + .size .L__unnamed_2, 64 + + .type .Lstr.4,@object # @str.4 + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lstr.4: + .asciz "riscv-64-linux-no_asserts-no_runtime-rvv-vector_bits_128" + .size .Lstr.4, 57 + + .type .Lidw_halide_parallel_vec__metadata_storage,@object # @idw_halide_parallel_vec__metadata_storage + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.Lidw_halide_parallel_vec__metadata_storage: + .word 1 # 0x1 + .word 1 # 0x1 + .quad .L__unnamed_2 + .quad .Lstr.4 + .quad .Lstr + .size .Lidw_halide_parallel_vec__metadata_storage, 32 + + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .section ".note.GNU-stack","",@progbits diff --git a/aot/idw_halide_vec.h b/aot/idw_halide_vec.h new file mode 100644 index 0000000..6de0b40 --- /dev/null +++ b/aot/idw_halide_vec.h @@ -0,0 +1,56 @@ +#ifndef HALIDE__idw_halide_vec_h +#define HALIDE__idw_halide_vec_h +#include + +// Forward declarations of the types used in the interface +// to the Halide pipeline. +// +// For the definitions of these structs, include HalideRuntime.h + +// Halide's representation of a multi-dimensional array. +// Halide::Runtime::Buffer is a more user-friendly wrapper +// around this. Its declaration is in HalideBuffer.h +struct halide_buffer_t; + +// Metadata describing the arguments to the generated function. +// Used to construct calls to the _argv version of the function. +struct halide_filter_metadata_t; + +#ifndef HALIDE_MUST_USE_RESULT +#ifdef __has_attribute +#if __has_attribute(nodiscard) +#define HALIDE_MUST_USE_RESULT [[nodiscard]] +#elif __has_attribute(warn_unused_result) +#define HALIDE_MUST_USE_RESULT __attribute__((warn_unused_result)) +#else +#define HALIDE_MUST_USE_RESULT +#endif +#else +#define HALIDE_MUST_USE_RESULT +#endif +#endif + +#ifndef HALIDE_FUNCTION_ATTRS +#define HALIDE_FUNCTION_ATTRS +#endif + + + +#ifdef __cplusplus +extern "C" { +#endif + +HALIDE_FUNCTION_ATTRS +int idw_halide_vec_(struct halide_buffer_t *_idw_halide_vec__buffer); + +HALIDE_FUNCTION_ATTRS +int idw_halide_vec__argv(void **args); + +HALIDE_FUNCTION_ATTRS +const struct halide_filter_metadata_t *idw_halide_vec__metadata(); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/aot/idw_halide_vec.s b/aot/idw_halide_vec.s new file mode 100644 index 0000000..f595e22 --- /dev/null +++ b/aot/idw_halide_vec.s @@ -0,0 +1,450 @@ + .text + .attribute 4, 16 + .attribute 5, "rv64gcv0p7" + .file "halide_buffer_t.cpp" + .section .text.idw_halide_vec_,"ax",@progbits + .globl idw_halide_vec_ # -- Begin function idw_halide_vec_ + .p2align 1 + .type idw_halide_vec_,@function +idw_halide_vec_: # @idw_halide_vec_ +# %bb.0: # %entry + addi sp, sp, -192 + sd ra, 184(sp) # 8-byte Folded Spill + sd s0, 176(sp) # 8-byte Folded Spill + sd s1, 168(sp) # 8-byte Folded Spill + sd s2, 160(sp) # 8-byte Folded Spill + sd s3, 152(sp) # 8-byte Folded Spill + sd s4, 144(sp) # 8-byte Folded Spill + sd s5, 136(sp) # 8-byte Folded Spill + sd s6, 128(sp) # 8-byte Folded Spill + sd s7, 120(sp) # 8-byte Folded Spill + sd s8, 112(sp) # 8-byte Folded Spill + sd s9, 104(sp) # 8-byte Folded Spill + sd s10, 96(sp) # 8-byte Folded Spill + sd s11, 88(sp) # 8-byte Folded Spill +.Lpcrel_hi0: + auipc a1, %pcrel_hi(.Lb44.buffer) + addi a1, a1, %pcrel_lo(.Lpcrel_hi0) + ld s6, 16(a1) + ld a3, 40(a0) +.Lpcrel_hi1: + auipc a2, %pcrel_hi(.Lb45.buffer) + addi a2, a2, %pcrel_lo(.Lpcrel_hi1) + ld s9, 16(a2) + lw s4, 0(a3) + lwu s8, 4(a3) + lw a4, 16(a3) + sd a4, 8(sp) # 8-byte Folded Spill + ld a4, 0(a1) + lwu a5, 20(a3) + sd a5, 32(sp) # 8-byte Folded Spill + lw s1, 24(a3) + or a5, s6, a4 + mv a4, s9 + bnez a5, .LBB0_2 +# %bb.1: # %then_bb + lui a4, 128 + addiw a4, a4, 9 + slli a4, a4, 13 + sd a4, 32(a1) + ld a4, 40(a1) + sd zero, 0(a1) + sd zero, 8(a1) + sd zero, 16(a1) + sw zero, 0(a4) + li a5, 300 + sw a5, 4(a4) + li a5, 1 + sw a5, 8(a4) + sw zero, 12(a4) + ld a4, 16(a2) + sd zero, 24(a1) +.LBB0_2: # %after_bb + ld a5, 0(a2) + ld s11, 16(a0) + or a4, a4, a5 + bnez a4, .LBB0_4 +# %bb.3: # %then_bb2 + sd zero, 16(a2) + sd zero, 8(a2) + sd zero, 0(a2) + lui a4, 128 + addiw a4, a4, 9 + ld a5, 40(a2) + slli a4, a4, 13 + addi a4, a4, 2 + sd a4, 32(a2) + sw zero, 0(a5) + li a4, 100 + sw a4, 4(a5) + li a4, 1 + sw a4, 8(a5) + sw zero, 12(a5) + sd zero, 24(a2) +.LBB0_4: # %after_bb1 + sext.w a4, s8 + sd a4, 24(sp) # 8-byte Folded Spill + lw a4, 32(sp) # 8-byte Folded Reload + sd s1, 16(sp) # 8-byte Folded Spill + beqz s11, .LBB0_6 +# %bb.5: + li a0, 0 + j .LBB0_9 +.LBB0_6: # %_halide_buffer_is_bounds_query.exit36 + ld a5, 0(a0) + bnez a5, .LBB0_8 +# %bb.7: # %then_bb5 + sd zero, 16(a0) + sd zero, 8(a0) + sd zero, 0(a0) + lui a5, 256 + addiw a5, a5, 9 + slli a5, a5, 13 + addi a5, a5, 2 + sd a5, 32(a0) + sw s4, 0(a3) + ld s1, 24(sp) # 8-byte Folded Reload + sw s1, 4(a3) + li a5, 1 + sw a5, 8(a3) + sw zero, 12(a3) + ld a5, 8(sp) # 8-byte Folded Reload + sw a5, 16(a3) + sw a4, 20(a3) + sw s1, 24(a3) + sw zero, 28(a3) + sd zero, 24(a0) +.LBB0_8: # %land.rhs.i49 + ld a0, 0(a0) + seqz a0, a0 +.LBB0_9: # %_halide_buffer_is_bounds_query.exit50 + ld a3, 16(a1) + ld a1, 0(a1) + ld a5, 16(a2) + ld a2, 0(a2) + or a1, a1, a3 + seqz a1, a1 + or a2, a2, a5 + seqz a2, a2 + or a1, a1, a2 + or a0, a0, a1 + slti a1, a4, 1 + or a0, a0, a1 + bnez a0, .LBB0_20 +# %bb.10: # %"for idw_halide_vec_.s0.y.rebased.preheader" + ld s3, 16(sp) # 8-byte Folded Reload + ld a0, 24(sp) # 8-byte Folded Reload + blez a0, .LBB0_13 +# %bb.11: # %"for idw_halide_vec_.s0.y.rebased.us.preheader" + li s0, 0 + ld s2, 24(sp) # 8-byte Folded Reload + slli s2, s2, 2 + ld s1, 32(sp) # 8-byte Folded Reload +.LBB0_12: # %"for idw_halide_vec_.s0.y.rebased.us" + # =>This Inner Loop Header: Depth=1 + slli a0, s0, 2 + add a0, a0, s11 + li a1, 0 + mv a2, s2 + call memset@plt + addi s1, s1, -1 + addw s0, s0, s3 + bnez s1, .LBB0_12 +.LBB0_13: # %"for idw_halide_vec_.s1.y.rebased.preheader" + sd zero, 40(sp) # 8-byte Folded Spill + li a1, 6 + li a2, 1 + addi a3, sp, 64 + addi a7, sp, 76 + addi t0, sp, 72 + addi t1, sp, 68 + li s0, 4 + addi s1, sp, 48 + addi t2, sp, 52 + addi t3, sp, 60 + addi t6, sp, 56 + li ra, 2 + li a6, 3 +.LBB0_14: # %"for idw_halide_vec_.s1.y.rebased" + # =>This Loop Header: Depth=1 + # Child Loop BB0_16 Depth 2 + # Child Loop BB0_17 Depth 3 + ld a0, 24(sp) # 8-byte Folded Reload + blez a0, .LBB0_19 +# %bb.15: # %"for idw_halide_vec_.s1.x.rebased.preheader" + # in Loop: Header=BB0_14 Depth=1 + li t4, 0 + ld a0, 16(sp) # 8-byte Folded Reload + ld a4, 40(sp) # 8-byte Folded Reload + mulw t5, a0, a4 + ld a0, 8(sp) # 8-byte Folded Reload + addw a0, a0, a4 +.LBB0_16: # %"for idw_halide_vec_.s1.x.rebased" + # Parent Loop BB0_14 Depth=1 + # => This Loop Header: Depth=2 + # Child Loop BB0_17 Depth 3 + add a4, t4, t5 + slli a4, a4, 2 + add s2, s11, a4 + flw ft0, 0(s2) + addw a4, s4, t4 + li s5, 25 + mv s3, s6 + mv s10, s9 +.LBB0_17: # %"for idw_halide_vec_.s1.r24$x.r24$x" + # Parent Loop BB0_14 Depth=1 + # Parent Loop BB0_16 Depth=2 + # => This Inner Loop Header: Depth=3 + addi s7, s3, 4 + vsetvli zero, a1, e32, m2 + vlwu.v v8, (s7) + vsetvli zero, a2, e32, m2 + vslidedown.vi v10, v8, 3 + addi a5, s3, 20 + vsetvli zero, a1, e32, m2 + vlwu.v v12, (a5) + vsetvli zero, a2, e32, m2 + vslidedown.vi v14, v12, 2 + vslidedown.vi v12, v12, 5 + vsw.v v8, (a3) + vsw.v v12, (a7) + vsw.v v14, (t0) + vsw.v v10, (t1) + vsetvli zero, s0, e32, m1 + vlwu.v v8, (a3) + vsetvli a5, zero, e32, m1 + vrsub.vx v8, v8, a4 + vsetvli zero, a1, e32, m2 + vlwu.v v10, (s3) + vsetvli zero, a2, e32, m2 + vslidedown.vi v12, v10, 3 + addi a5, s3, 16 + vsetvli zero, a1, e32, m2 + vlwu.v v14, (a5) + vsetvli zero, a2, e32, m2 + vslidedown.vi v16, v14, 2 + vslidedown.vi v14, v14, 5 + vsw.v v10, (s1) + vsw.v v12, (t2) + vsw.v v14, (t3) + vsw.v v16, (t6) + vsetvli zero, s0, e32, m1 + vlwu.v v9, (s1) + vsetvli a5, zero, e32, m1 + vrsub.vx v9, v9, a0 + vmul.vv v9, v9, v9 + vmacc.vv v9, v8, v8 + vfcvt.f.x.v v8, v9 + vfmv.f.s ft1, v8 + fsqrt.s ft1, ft1 + vfmv.s.f v9, ft1 + vsetvli zero, a2, e32, m1 + vslidedown.vi v10, v8, 1 + vfmv.f.s ft1, v10 + fsqrt.s ft1, ft1 + vsetvli a5, zero, e32, m1 + vfmv.s.f v10, ft1 + vsetvli zero, ra, e32, m1 + vslideup.vi v9, v10, 1 + vsetvli zero, a2, e32, m1 + vslidedown.vi v10, v8, 2 + vfmv.f.s ft1, v10 + fsqrt.s ft1, ft1 + vsetvli a5, zero, e32, m1 + vfmv.s.f v10, ft1 + vsetvli zero, a6, e32, m1 + vslideup.vi v9, v10, 2 + vsetvli zero, a2, e32, m1 + vslidedown.vi v8, v8, 3 + vfmv.f.s ft1, v8 + fsqrt.s ft1, ft1 + vsetvli a5, zero, e32, m1 + vfmv.s.f v8, ft1 + vsetvli zero, s0, e32, m1 + vslideup.vi v9, v8, 3 + vsetvli zero, zero, e32, m1 + vlwu.v v8, (s10) + vfmul.vv v8, v9, v8 + vfmv.s.f v9, ft0 + vfredsum.vs v8, v8, v9 + vfmv.f.s ft0, v8 + addi s5, s5, -1 + addi s10, s10, 16 + addi s3, s3, 48 + bnez s5, .LBB0_17 +# %bb.18: # %"end for idw_halide_vec_.s1.r24$x.r24$x" + # in Loop: Header=BB0_16 Depth=2 + addi t4, t4, 1 + fsw ft0, 0(s2) + bne t4, s8, .LBB0_16 +.LBB0_19: # %"end for idw_halide_vec_.s1.x.rebased" + # in Loop: Header=BB0_14 Depth=1 + ld a4, 40(sp) # 8-byte Folded Reload + addi a4, a4, 1 + ld a0, 32(sp) # 8-byte Folded Reload + sd a4, 40(sp) # 8-byte Folded Spill + bne a4, a0, .LBB0_14 +.LBB0_20: # %destructor_block + li a0, 0 + ld ra, 184(sp) # 8-byte Folded Reload + ld s0, 176(sp) # 8-byte Folded Reload + ld s1, 168(sp) # 8-byte Folded Reload + ld s2, 160(sp) # 8-byte Folded Reload + ld s3, 152(sp) # 8-byte Folded Reload + ld s4, 144(sp) # 8-byte Folded Reload + ld s5, 136(sp) # 8-byte Folded Reload + ld s6, 128(sp) # 8-byte Folded Reload + ld s7, 120(sp) # 8-byte Folded Reload + ld s8, 112(sp) # 8-byte Folded Reload + ld s9, 104(sp) # 8-byte Folded Reload + ld s10, 96(sp) # 8-byte Folded Reload + ld s11, 88(sp) # 8-byte Folded Reload + addi sp, sp, 192 + ret +.Lfunc_end0: + .size idw_halide_vec_, .Lfunc_end0-idw_halide_vec_ + # -- End function + .section .text.idw_halide_vec__argv,"ax",@progbits + .globl idw_halide_vec__argv # -- Begin function idw_halide_vec__argv + .p2align 1 + .type idw_halide_vec__argv,@function +idw_halide_vec__argv: # @idw_halide_vec__argv +# %bb.0: # %entry + addi sp, sp, -16 + sd ra, 8(sp) # 8-byte Folded Spill + ld a0, 0(a0) + call idw_halide_vec_@plt + li a0, 0 + ld ra, 8(sp) # 8-byte Folded Reload + addi sp, sp, 16 + ret +.Lfunc_end1: + .size idw_halide_vec__argv, .Lfunc_end1-idw_halide_vec__argv + # -- End function + .section .text.idw_halide_vec__metadata,"ax",@progbits + .globl idw_halide_vec__metadata # -- Begin function idw_halide_vec__metadata + .p2align 1 + .type idw_halide_vec__metadata,@function +idw_halide_vec__metadata: # @idw_halide_vec__metadata +# %bb.0: # %entry +.Lpcrel_hi2: + auipc a0, %pcrel_hi(.Lidw_halide_vec__metadata_storage) + addi a0, a0, %pcrel_lo(.Lpcrel_hi2) + ret +.Lfunc_end2: + .size idw_halide_vec__metadata, .Lfunc_end2-idw_halide_vec__metadata + # -- End function + .type .Lb44.shape,@object # @b44.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb44.shape: + .asciz "\000\000\000\000,\001\000\000\001\000\000\000\000\000\000" + .size .Lb44.shape, 16 + + .type .Lb44.data,@object # @b44.data + .p2align 5, 0x0 +.Lb44.data: + .asciz "\000\000\000\000\000\000\000\000H\000\000\000\000\000\000\000\325\000\000\000O\000\000\000\000\000\000\000\252\001\000\000<\000\000\000\000\000\000\000\200\002\000\000L\000\000\000\000\000\000\000U\003\000\000\200\000\000\000\000\000\000\000*\004\000\000C\000\000\000\000\000\000\000\000\005\000\000A\000\000\000\000\000\000\000\325\005\000\000@\000\000\000\000\000\000\000\252\006\000\000<\000\000\000\000\000\000\000\200\007\000\000=\000\000\000x\000\000\000\000\000\000\000Q\000\000\000x\000\000\000\325\000\000\000O\000\000\000x\000\000\000\252\001\000\000:\000\000\000x\000\000\000\200\002\000\000\204\000\000\000x\000\000\000U\003\000\000\225\000\000\000x\000\000\000*\004\000\000\216\000\000\000x\000\000\000\000\005\000\000@\000\000\000x\000\000\000\325\005\000\000E\000\000\000x\000\000\000\252\006\000\000A\000\000\000x\000\000\000\200\007\000\000@\000\000\000\360\000\000\000\000\000\000\000K\000\000\000\360\000\000\000\325\000\000\000D\000\000\000\360\000\000\000\252\001\000\000\214\000\000\000\360\000\000\000\200\002\000\000\231\000\000\000\360\000\000\000U\003\000\000\221\000\000\000\360\000\000\000*\004\000\000\204\000\000\000\360\000\000\000\000\005\000\000\230\000\000\000\360\000\000\000\325\005\000\000}\000\000\000\360\000\000\000\252\006\000\000B\000\000\000\360\000\000\000\200\007\000\000:\000\000\000h\001\000\000\000\000\000\000N\000\000\000h\001\000\000\325\000\000\000<\000\000\000h\001\000\000\252\001\000\000\213\000\000\000h\001\000\000\200\002\000\000\250\000\000\000h\001\000\000U\003\000\000\232\000\000\000h\001\000\000*\004\000\000\212\000\000\000h\001\000\000\000\005\000\000\221\000\000\000h\001\000\000\325\005\000\000\240\000\000\000h\001\000\000\252\006\000\000D\000\000\000h\001\000\000\200\007\000\000<\000\000\000\340\001\000\000\000\000\000\000M\000\000\000\340\001\000\000\325\000\000\000;\000\000\000\340\001\000\000\252\001\000\000\245\000\000\000\340\001\000\000\200\002\000\000\267\000\000\000\340\001\000\000U\003\000\000\246\000\000\000\340\001\000\000*\004\000\000\216\000\000\000\340\001\000\000\000\005\000\000{\000\000\000\340\001\000\000\325\005\000\000\233\000\000\000\340\001\000\000\252\006\000\000\220\000\000\000\340\001\000\000\200\007\000\000<\000\000\000X\002\000\000\000\000\000\000S\000\000\000X\002\000\000\325\000\000\000A\000\000\000X\002\000\000\252\001\000\000\262\000\000\000X\002\000\000\200\002\000\000\270\000\000\000X\002\000\000U\003\000\000\212\000\000\000X\002\000\000*\004\000\000|\000\000\000X\002\000\000\000\005\000\000\204\000\000\000X\002\000\000\325\005\000\000\257\000\000\000X\002\000\000\252\006\000\000\257\000\000\000X\002\000\000\200\007\000\000<\000\000\000\320\002\000\000\000\000\000\000V\000\000\000\320\002\000\000\325\000\000\000<\000\000\000\320\002\000\000\252\001\000\000\266\000\000\000\320\002\000\000\200\002\000\000\263\000\000\000\320\002\000\000U\003\000\000\230\000\000\000\320\002\000\000*\004\000\000\214\000\000\000\320\002\000\000\000\005\000\000s\000\000\000\320\002\000\000\325\005\000\000\234\000\000\000\320\002\000\000\252\006\000\000\254\000\000\000\320\002\000\000\200\007\000\000A\000\000\000H\003\000\000\000\000\000\000Z\000\000\000H\003\000\000\325\000\000\000E\000\000\000H\003\000\000\252\001\000\000\244\000\000\000H\003\000\000\200\002\000\000\272\000\000\000H\003\000\000U\003\000\000\222\000\000\000H\003\000\000*\004\000\000\223\000\000\000H\003\000\000\000\005\000\000z\000\000\000H\003\000\000\325\005\000\000\230\000\000\000H\003\000\000\252\006\000\000\246\000\000\000H\003\000\000\200\007\000\000E\000\000\000\300\003\000\000\000\000\000\000U\000\000\000\300\003\000\000\325\000\000\000C\000\000\000\300\003\000\000\252\001\000\000\272\000\000\000\300\003\000\000\200\002\000\000\251\000\000\000\300\003\000\000U\003\000\000\242\000\000\000\300\003\000\000*\004\000\000\223\000\000\000\300\003\000\000\000\005\000\000|\000\000\000\300\003\000\000\325\005\000\000\225\000\000\000\300\003\000\000\252\006\000\000\235\000\000\000\300\003\000\000\200\007\000\000E\000\000\0008\004\000\000\000\000\000\000U\000\000\0008\004\000\000\325\000\000\000F\000\000\0008\004\000\000\252\001\000\000\301\000\000\0008\004\000\000\200\002\000\000\300\000\000\0008\004\000\000U\003\000\000\233\000\000\0008\004\000\000*\004\000\000\207\000\000\0008\004\000\000\000\005\000\000x\000\000\0008\004\000\000\325\005\000\000\212\000\000\0008\004\000\000\252\006\000\000\200\000\000\0008\004\000\000\200\007\000\000F\000\000" + .size .Lb44.data, 1200 + + .type .Lb44.buffer,@object # @b44.buffer + .data + .p2align 4, 0x0 +.Lb44.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb44.data + .quad 1 # 0x1 + .byte 0 # 0x0 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb44.shape + .quad 0 + .size .Lb44.buffer, 56 + + .type .Lb45.shape,@object # @b45.shape + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lb45.shape: + .asciz "\000\000\000\000d\000\000\000\001\000\000\000\000\000\000" + .size .Lb45.shape, 16 + + .type .Lb45.data,@object # @b45.data + .p2align 5, 0x0 +.Lb45.data: + .ascii "\211\004&=\244\022\233\275\217U%;\272)J>#\026\024\276@_\234>\340\243|\275\334\034K\275\232\300\004:A\313\245\274\303\024r\275\345\313a\275x\231\300>\201\334\"\276\236}\321\275H\346\257\276\202V\310>\225\375(>},\026\274E\235W\275\241\236\236<\245\220\222=\004\361\263\276\2620\240\274\265\036\240=\\:\336=\240\016\263\276\260\224\371\275p\235(=\311\\x;\226y\\\275\360\227\377=\275\364\200=rw/<@U\017=c\327\003=nV\267\273\3543\207\311\274<&{\274h\b\031>\203\222\003\276\213\3649\275d\177\017\276\330\306\246\275'\314:>\203\024\253=S\331M\276\257\263S=b\221q\275\336\t\306=#^\005\2763)\256\274\344\255G>\224!\374=\374\360o\275\215#\031\276\322`J\276y6\360=\273\303;\275N\"N>M\022K\276\016\035C=1Fb\275u\017\220\275\0040\033>,\037\223=t\335\304\275\363\344\271=\320\341\260\275\2265\255=\310\"b=&\031\330\275;\314\000>>\347o\275\255\222\020=C\rU<\370d\307\275\373\306\206=I\345\373\274Y|\034>WY;\276O\240?>B\347\226\275\203\271\203\275W\243\313<\214\364\343\274Ey\033\276J4\214=\261\304\241\274-\271p>X\313l\276*_\366\275\260^\\=\237l1=\371\272\200=\002\373?\274*\3578<]Gv=" + .size .Lb45.data, 400 + + .type .Lb45.buffer,@object # @b45.buffer + .data + .p2align 4, 0x0 +.Lb45.buffer: + .quad 0 # 0x0 + .quad 0 + .quad .Lb45.data + .quad 1 # 0x1 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .word 1 # 0x1 + .quad .Lb45.shape + .quad 0 + .size .Lb45.buffer, 56 + + .type .L__unnamed_1,@object # @0 + .section .rodata,"a",@progbits + .p2align 4, 0x0 +.L__unnamed_1: + .zero 32 + .size .L__unnamed_1, 32 + + .type .Lstr,@object # @str + .p2align 5, 0x0 +.Lstr: + .asciz "idw_halide_vec_" + .size .Lstr, 16 + + .type .L__unnamed_2,@object # @1 + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.L__unnamed_2: + .quad .Lstr + .word 2 # 0x2 + .word 2 # 0x2 + .byte 2 # 0x2 + .byte 32 # 0x20 + .half 1 # 0x1 + .zero 4 + .quad 0 + .quad 0 + .quad 0 + .quad 0 + .quad .L__unnamed_1 + .size .L__unnamed_2, 64 + + .type .Lstr.4,@object # @str.4 + .section .rodata,"a",@progbits + .p2align 5, 0x0 +.Lstr.4: + .asciz "riscv-64-linux-no_asserts-no_runtime-rvv-vector_bits_128" + .size .Lstr.4, 57 + + .type .Lidw_halide_vec__metadata_storage,@object # @idw_halide_vec__metadata_storage + .section .data.rel.ro,"aw",@progbits + .p2align 4, 0x0 +.Lidw_halide_vec__metadata_storage: + .word 1 # 0x1 + .word 1 # 0x1 + .quad .L__unnamed_2 + .quad .Lstr.4 + .quad .Lstr + .size .Lidw_halide_vec__metadata_storage, 32 + + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .ident "clang version 16.0.0 (https://github.com/dkurt/llvm-rvv-071 b027aa1b59c9f53240bdc836f39656723fdf9df0)" + .section ".note.GNU-stack","",@progbits diff --git a/include/algos.hpp b/include/algos.hpp index d6c9f0c..ce614ad 100644 --- a/include/algos.hpp +++ b/include/algos.hpp @@ -1,47 +1,53 @@ -// Copyright (C), 2023, KNS Group LLC (YADRO) - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -void histogram_ref(const uint8_t* src, int32_t* dst, int height, int width); -void histogram_halide(uint8_t* src, int32_t* dst, int height, int width); -void histogram_opencv(const cv::Mat& src, cv::Mat& dst); - -void bgr2gray_ref(const uint8_t* src, uint8_t* dst, int height, int width); -void bgr2gray_interleaved_halide(uint8_t* src, uint8_t* dst, int height, int width); -void bgr2gray_planar_halide(uint8_t* src, uint8_t* dst, int height, int width); -void bgr2gray_opencv(const cv::Mat& src, cv::Mat& dst); - -void boxFilter_halide(uint16_t* src, uint16_t* dst, int height, int width); -void boxFilter_opencv(const cv::Mat& src, cv::Mat& dst); -void ascii_art_ref(const uint8_t* src, uint8_t* dst, int height, int width); -void ascii_art_halide(uint8_t* src, uint8_t* dst, int input_height, int input_width); - -void julia_ref(uint8_t* dst, int height, int width); -void halide_julia(uint8_t* dst, int height, int width); - -#ifdef HAVE_OPENCV_DNN -void convolution_nchw_halide(float* src, float* kernel, float* dst, - int inpChannels, int outChannels, int height, int width); -void convolution_nhwc_halide(float* src, float* kernel, float* dst, - int inpChannels, int outChannels, int height, int width); -void convolution_opencv(const cv::Mat& src, const cv::Mat& weights, cv::Mat& dst, - int inpChannels, int outChannels); -#endif // HAVE_OPENCV_DNN - -void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights); - -void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf); - -void voxel_up(float* src, float* kernel, float* dst, - int inpChannels, int height, int width, int depth); -void upscale(const std::vector img_path, int width, int height); +// Copyright (C), 2023, KNS Group LLC (YADRO) + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +void histogram_ref(const uint8_t* src, int32_t* dst, int height, int width); +void histogram_halide(uint8_t* src, int32_t* dst, int height, int width); +void histogram_opencv(const cv::Mat& src, cv::Mat& dst); + +void bgr2gray_ref(const uint8_t* src, uint8_t* dst, int height, int width); +void bgr2gray_interleaved_halide(uint8_t* src, uint8_t* dst, int height, int width); +void bgr2gray_planar_halide(uint8_t* src, uint8_t* dst, int height, int width); +void bgr2gray_opencv(const cv::Mat& src, cv::Mat& dst); + +void boxFilter_halide(uint16_t* src, uint16_t* dst, int height, int width); +void boxFilter_opencv(const cv::Mat& src, cv::Mat& dst); +void ascii_art_ref(const uint8_t* src, uint8_t* dst, int height, int width); +void ascii_art_halide(uint8_t* src, uint8_t* dst, int input_height, int input_width); + +void julia_ref(uint8_t* dst, int height, int width); +void halide_julia(uint8_t* dst, int height, int width); + +#ifdef HAVE_OPENCV_DNN +void convolution_nchw_halide(float* src, float* kernel, float* dst, + int inpChannels, int outChannels, int height, int width); +void convolution_nhwc_halide(float* src, float* kernel, float* dst, + int inpChannels, int outChannels, int height, int width); +void convolution_opencv(const cv::Mat& src, const cv::Mat& weights, cv::Mat& dst, + int inpChannels, int outChannels); +#endif // HAVE_OPENCV_DNN + +void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights); + +void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf); + +void idw_halide_parallel(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf); + +void idw_halide_vec(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf); + +void idw_halide_parallel_vec(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf); + +void voxel_up(float* src, float* kernel, float* dst, + int inpChannels, int height, int width, int depth); +void upscale(const std::vector img_path, int width, int height); diff --git a/perf/perf_main.cpp b/perf/perf_main.cpp index 0377cea..61b5253 100644 --- a/perf/perf_main.cpp +++ b/perf/perf_main.cpp @@ -1,257 +1,309 @@ -// Copyright (C), 2023, KNS Group LLC (YADRO) - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "algos.hpp" - -using namespace cv; - -CV_PERF_TEST_MAIN("") - -Mat src(1080, 1920, CV_8UC3); - -static const int julia_width = 200; -static const int julia_height = 200; - -PERF_TEST(julia, halide) { - Mat dst(julia_height, julia_width, CV_8UC1, Scalar(0)); - - PERF_SAMPLE_BEGIN() - halide_julia(dst.ptr(), dst.rows, dst.cols); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -PERF_TEST(julia, reference) { - Mat dst(julia_height, julia_width, CV_8UC1, Scalar(0)); - - PERF_SAMPLE_BEGIN() - julia_ref(dst.ptr(), dst.rows, dst.cols); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -PERF_TEST(histogram, reference) { - Mat dst(3, 256, CV_32S); - randu(src, 0, 256); - - PERF_SAMPLE_BEGIN() - histogram_ref(src.ptr(), dst.ptr(), src.rows, src.cols); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -PERF_TEST(histogram, opencv) { - Mat dst(3, 256, CV_32F); - randu(src, 0, 256); - - PERF_SAMPLE_BEGIN() - histogram_opencv(src, dst); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -PERF_TEST(histogram, halide) { - Mat dst(3, 256, CV_32S); - randu(src, 0, 256); - - PERF_SAMPLE_BEGIN() - histogram_halide(src.ptr(), dst.ptr(), src.rows, src.cols); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -PERF_TEST(bgr2gray, reference) { - Mat dst(src.size(), CV_8U); - randu(src, 0, 256); - - PERF_SAMPLE_BEGIN() - bgr2gray_ref(src.ptr(), dst.ptr(), src.rows, src.cols); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -PERF_TEST(bgr2gray, opencv) { - Mat dst(src.size(), CV_8U); - randu(src, 0, 256); - - PERF_SAMPLE_BEGIN() - bgr2gray_opencv(src, dst); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -PERF_TEST(bgr2gray_interleaved, halide) { - randu(src, 0, 256); - Mat dst(src.size(), CV_8U); - - PERF_SAMPLE_BEGIN() - bgr2gray_interleaved_halide(src.ptr(), dst.ptr(), src.rows, src.cols); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -PERF_TEST(bgr2gray_planar, halide) { - randu(src, 0, 256); - Mat dst(src.size(), CV_8U); - - PERF_SAMPLE_BEGIN() - bgr2gray_planar_halide(src.ptr(), dst.ptr(), src.rows, src.cols); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -PERF_TEST(boxFilter, halide) { - Mat src16(src.size(), CV_16U); - Mat dst(src.rows - 2, src.cols - 2, CV_16U); - randu(src16, 0, 256); - - PERF_SAMPLE_BEGIN() - boxFilter_halide(src16.ptr(), dst.ptr(), src.rows, src.cols); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -PERF_TEST(boxFilter, opencv) { - Mat srcCh(src.size(), CV_8U); - Mat dst(src.size(), CV_8U); - randu(src, 0, 256); - - PERF_SAMPLE_BEGIN() - boxFilter_opencv(srcCh, dst); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -#ifdef HAVE_OPENCV_DNN - -PERF_TEST(convolution, opencv) { - static const int ic = 16; - static const int oc = 32; - static const int height = 128; - static const int width = 128; - Mat src({1, ic, height, width}, CV_32F); - Mat kernel({oc, ic, 3, 3}, CV_32F); - Mat dst({1, oc, height - 1, width - 1}, CV_32F); - randn(src, 0, 1); - randn(kernel, 0, 1); - - PERF_SAMPLE_BEGIN() - convolution_opencv(src, kernel, dst, ic, oc); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -PERF_TEST(convolution_nchw, halide) { - static const int ic = 16; - static const int oc = 32; - static const int height = 128; - static const int width = 128; - Mat src({1, ic, height, width}, CV_32F); - Mat kernel({oc, ic, 3, 3}, CV_32F); - Mat dst({1, oc, height - 2, width - 2}, CV_32F); - randn(src, 0, 1); - randn(kernel, 0, 1); - - PERF_SAMPLE_BEGIN() - convolution_nchw_halide(src.ptr(), kernel.ptr(), dst.ptr(), - ic, oc, height, width); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - - -PERF_TEST(convolution_nhwc, halide) { - static const int ic = 16; - static const int oc = 32; - static const int height = 128; - static const int width = 128; - Mat src({1, height, width, ic}, CV_32F); - Mat kernel({oc, ic, 3, 3}, CV_32F); - Mat dst({1, height - 2, width - 2, oc}, CV_32F); - randn(src, 0, 1); - randn(kernel, 0, 1); - - PERF_SAMPLE_BEGIN() - convolution_nhwc_halide(src.ptr(), kernel.ptr(), dst.ptr(), - ic, oc, height, width); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -#endif // HAVE_OPENCV_DNN - - -// 300 elems (100 points) -static int idwPoints[] = {0, 0, 72, 0, 213, 79, 0, 426, 60, 0, 640, 76, 0, 853, 128, 0, 1066, 67, 0, 1280, 65, 0, 1493, 64, 0, 1706, 60, 0, 1920, 61, 120, 0, 81, 120, 213, 79, 120, 426, 58, 120, 640, 132, 120, 853, 149, 120, 1066, 142, 120, 1280, 64, 120, 1493, 69, 120, 1706, 65, 120, 1920, 64, 240, 0, 75, 240, 213, 68, 240, 426, 140, 240, 640, 153, 240, 853, 145, 240, 1066, 132, 240, 1280, 152, 240, 1493, 125, 240, 1706, 66, 240, 1920, 58, 360, 0, 78, 360, 213, 60, 360, 426, 139, 360, 640, 168, 360, 853, 154, 360, 1066, 138, 360, 1280, 145, 360, 1493, 160, 360, 1706, 68, 360, 1920, 60, 480, 0, 77, 480, 213, 59, 480, 426, 165, 480, 640, 183, 480, 853, 166, 480, 1066, 142, 480, 1280, 123, 480, 1493, 155, 480, 1706, 144, 480, 1920, 60, 600, 0, 83, 600, 213, 65, 600, 426, 178, 600, 640, 184, 600, 853, 138, 600, 1066, 124, 600, 1280, 132, 600, 1493, 175, 600, 1706, 175, 600, 1920, 60, 720, 0, 86, 720, 213, 60, 720, 426, 182, 720, 640, 179, 720, 853, 152, 720, 1066, 140, 720, 1280, 115, 720, 1493, 156, 720, 1706, 172, 720, 1920, 65, 840, 0, 90, 840, 213, 69, 840, 426, 164, 840, 640, 186, 840, 853, 146, 840, 1066, 147, 840, 1280, 122, 840, 1493, 152, 840, 1706, 166, 840, 1920, 69, 960, 0, 85, 960, 213, 67, 960, 426, 186, 960, 640, 169, 960, 853, 162, 960, 1066, 147, 960, 1280, 124, 960, 1493, 149, 960, 1706, 157, 960, 1920, 69, 1080, 0, 85, 1080, 213, 70, 1080, 426, 193, 1080, 640, 192, 1080, 853, 155, 1080, 1066, 135, 1080, 1280, 120, 1080, 1493, 138, 1080, 1706, 128, 1080, 1920, 70}; -// 100 elems -float idwWeights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; - -PERF_TEST(idw, reference) { - const int width = 1920; - const int height = 1080; - - Mat dst(height, width, CV_8U); - - PERF_SAMPLE_BEGIN() - idw_ref(NULL, dst.ptr(), height, width, idwPoints, idwWeights); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -PERF_TEST(idw, halide) { - const int width = 1920; - const int height = 1080; - - Mat dst(height, width, CV_8U); - - PERF_SAMPLE_BEGIN() - idw_halide(NULL, dst.ptr(), height, width, idwPoints, idwWeights); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} - -PERF_TEST(voxel_up, halide) { - static const int ic = 4; - static const int height = 100; - static const int width = 100; - static const int batch = 72; - Mat src({width, height,ic, batch}, CV_32F); - Mat kernel({4, 4, 4, ic}, CV_32F); - Mat dst({width*2, height*2,ic, batch*2}, CV_32F); - randn(src, 0, 1); - randn(kernel, 0, 1); - - PERF_SAMPLE_BEGIN() - voxel_up(src.ptr(), kernel.ptr(), dst.ptr(), - ic, width, height, batch); - PERF_SAMPLE_END() - - SANITY_CHECK_NOTHING(); -} +// Copyright (C), 2023, KNS Group LLC (YADRO) + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "algos.hpp" + +// #ifdef __riscv +// #include +// #include "idw.h" +// using namespace Halide::Runtime; +// #else +// #include +// using namespace Halide; +// #endif + +// #include + +using namespace cv; + +// using namespace std; + +CV_PERF_TEST_MAIN("") + +Mat src(1080, 1920, CV_8UC3); + +static const int julia_width = 200; +static const int julia_height = 200; + +PERF_TEST(julia, halide) { + Mat dst(julia_height, julia_width, CV_8UC1, Scalar(0)); + + PERF_SAMPLE_BEGIN() + halide_julia(dst.ptr(), dst.rows, dst.cols); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(julia, reference) { + Mat dst(julia_height, julia_width, CV_8UC1, Scalar(0)); + + PERF_SAMPLE_BEGIN() + julia_ref(dst.ptr(), dst.rows, dst.cols); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(histogram, reference) { + Mat dst(3, 256, CV_32S); + randu(src, 0, 256); + + PERF_SAMPLE_BEGIN() + histogram_ref(src.ptr(), dst.ptr(), src.rows, src.cols); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(histogram, opencv) { + Mat dst(3, 256, CV_32F); + randu(src, 0, 256); + + PERF_SAMPLE_BEGIN() + histogram_opencv(src, dst); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(histogram, halide) { + Mat dst(3, 256, CV_32S); + randu(src, 0, 256); + + PERF_SAMPLE_BEGIN() + histogram_halide(src.ptr(), dst.ptr(), src.rows, src.cols); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(bgr2gray, reference) { + Mat dst(src.size(), CV_8U); + randu(src, 0, 256); + + PERF_SAMPLE_BEGIN() + bgr2gray_ref(src.ptr(), dst.ptr(), src.rows, src.cols); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(bgr2gray, opencv) { + Mat dst(src.size(), CV_8U); + randu(src, 0, 256); + + PERF_SAMPLE_BEGIN() + bgr2gray_opencv(src, dst); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(bgr2gray_interleaved, halide) { + randu(src, 0, 256); + Mat dst(src.size(), CV_8U); + + PERF_SAMPLE_BEGIN() + bgr2gray_interleaved_halide(src.ptr(), dst.ptr(), src.rows, src.cols); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(bgr2gray_planar, halide) { + randu(src, 0, 256); + Mat dst(src.size(), CV_8U); + + PERF_SAMPLE_BEGIN() + bgr2gray_planar_halide(src.ptr(), dst.ptr(), src.rows, src.cols); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(boxFilter, halide) { + Mat src16(src.size(), CV_16U); + Mat dst(src.rows - 2, src.cols - 2, CV_16U); + randu(src16, 0, 256); + + PERF_SAMPLE_BEGIN() + boxFilter_halide(src16.ptr(), dst.ptr(), src.rows, src.cols); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(boxFilter, opencv) { + Mat srcCh(src.size(), CV_8U); + Mat dst(src.size(), CV_8U); + randu(src, 0, 256); + + PERF_SAMPLE_BEGIN() + boxFilter_opencv(srcCh, dst); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +#ifdef HAVE_OPENCV_DNN + +PERF_TEST(convolution, opencv) { + static const int ic = 16; + static const int oc = 32; + static const int height = 128; + static const int width = 128; + Mat src({1, ic, height, width}, CV_32F); + Mat kernel({oc, ic, 3, 3}, CV_32F); + Mat dst({1, oc, height - 1, width - 1}, CV_32F); + randn(src, 0, 1); + randn(kernel, 0, 1); + + PERF_SAMPLE_BEGIN() + convolution_opencv(src, kernel, dst, ic, oc); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(convolution_nchw, halide) { + static const int ic = 16; + static const int oc = 32; + static const int height = 128; + static const int width = 128; + Mat src({1, ic, height, width}, CV_32F); + Mat kernel({oc, ic, 3, 3}, CV_32F); + Mat dst({1, oc, height - 2, width - 2}, CV_32F); + randn(src, 0, 1); + randn(kernel, 0, 1); + + PERF_SAMPLE_BEGIN() + convolution_nchw_halide(src.ptr(), kernel.ptr(), dst.ptr(), + ic, oc, height, width); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + + +PERF_TEST(convolution_nhwc, halide) { + static const int ic = 16; + static const int oc = 32; + static const int height = 128; + static const int width = 128; + Mat src({1, height, width, ic}, CV_32F); + Mat kernel({oc, ic, 3, 3}, CV_32F); + Mat dst({1, height - 2, width - 2, oc}, CV_32F); + randn(src, 0, 1); + randn(kernel, 0, 1); + + PERF_SAMPLE_BEGIN() + convolution_nhwc_halide(src.ptr(), kernel.ptr(), dst.ptr(), + ic, oc, height, width); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +#endif // HAVE_OPENCV_DNN + + +// 300 elems (100 points) +static int idwPoints[] = {0, 0, 72, 0, 213, 79, 0, 426, 60, 0, 640, 76, 0, 853, 128, 0, 1066, 67, 0, 1280, 65, 0, 1493, 64, 0, 1706, 60, 0, 1920, 61, 120, 0, 81, 120, 213, 79, 120, 426, 58, 120, 640, 132, 120, 853, 149, 120, 1066, 142, 120, 1280, 64, 120, 1493, 69, 120, 1706, 65, 120, 1920, 64, 240, 0, 75, 240, 213, 68, 240, 426, 140, 240, 640, 153, 240, 853, 145, 240, 1066, 132, 240, 1280, 152, 240, 1493, 125, 240, 1706, 66, 240, 1920, 58, 360, 0, 78, 360, 213, 60, 360, 426, 139, 360, 640, 168, 360, 853, 154, 360, 1066, 138, 360, 1280, 145, 360, 1493, 160, 360, 1706, 68, 360, 1920, 60, 480, 0, 77, 480, 213, 59, 480, 426, 165, 480, 640, 183, 480, 853, 166, 480, 1066, 142, 480, 1280, 123, 480, 1493, 155, 480, 1706, 144, 480, 1920, 60, 600, 0, 83, 600, 213, 65, 600, 426, 178, 600, 640, 184, 600, 853, 138, 600, 1066, 124, 600, 1280, 132, 600, 1493, 175, 600, 1706, 175, 600, 1920, 60, 720, 0, 86, 720, 213, 60, 720, 426, 182, 720, 640, 179, 720, 853, 152, 720, 1066, 140, 720, 1280, 115, 720, 1493, 156, 720, 1706, 172, 720, 1920, 65, 840, 0, 90, 840, 213, 69, 840, 426, 164, 840, 640, 186, 840, 853, 146, 840, 1066, 147, 840, 1280, 122, 840, 1493, 152, 840, 1706, 166, 840, 1920, 69, 960, 0, 85, 960, 213, 67, 960, 426, 186, 960, 640, 169, 960, 853, 162, 960, 1066, 147, 960, 1280, 124, 960, 1493, 149, 960, 1706, 157, 960, 1920, 69, 1080, 0, 85, 1080, 213, 70, 1080, 426, 193, 1080, 640, 192, 1080, 853, 155, 1080, 1066, 135, 1080, 1280, 120, 1080, 1493, 138, 1080, 1706, 128, 1080, 1920, 70}; +// 100 elems +static float idwWeights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; + +PERF_TEST(idw, reference) { + const int width = 1920; + const int height = 1080; + + Mat dst(height, width, CV_8U); + + PERF_SAMPLE_BEGIN() + idw_ref(NULL, dst.ptr(), height, width, idwPoints, idwWeights); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(idw, halide) { + const int width = 1920; + const int height = 1080; + + Mat dst(height, width, CV_8U); + + PERF_SAMPLE_BEGIN() + idw_halide(NULL, dst.ptr(), height, width, idwPoints, idwWeights); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(idw, halide_parallel) { + const int width = 1920; + const int height = 1080; + + Mat dst(height, width, CV_8U); + + PERF_SAMPLE_BEGIN() + idw_halide_parallel(NULL, dst.ptr(), height, width, idwPoints, idwWeights); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(idw, halide_vec) { + const int width = 1920; + const int height = 1080; + + Mat dst(height, width, CV_8U); + + PERF_SAMPLE_BEGIN() + idw_halide_vec(NULL, dst.ptr(), height, width, idwPoints, idwWeights); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(idw, halide_parallel_vec) { + const int width = 1920; + const int height = 1080; + + Mat dst(height, width, CV_8U); + + PERF_SAMPLE_BEGIN() + idw_halide_parallel_vec(NULL, dst.ptr(), height, width, idwPoints, idwWeights); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} + +PERF_TEST(voxel_up, halide) { + static const int ic = 4; + static const int height = 100; + static const int width = 100; + static const int batch = 72; + Mat src({width, height,ic, batch}, CV_32F); + Mat kernel({4, 4, 4, ic}, CV_32F); + Mat dst({width*2, height*2,ic, batch*2}, CV_32F); + randn(src, 0, 1); + randn(kernel, 0, 1); + + PERF_SAMPLE_BEGIN() + voxel_up(src.ptr(), kernel.ptr(), dst.ptr(), + ic, width, height, batch); + PERF_SAMPLE_END() + + SANITY_CHECK_NOTHING(); +} diff --git a/src/idw.cpp b/src/idw.cpp index d974449..c8e782a 100644 --- a/src/idw.cpp +++ b/src/idw.cpp @@ -1,124 +1,369 @@ -#include - -#include "algos.hpp" - -#include -#include - -using namespace std; - -#ifdef __riscv - #include - #include "idw.h" - using namespace Halide::Runtime; -#else - #include - using namespace Halide; -#endif - -static const int pointCount = 100; - -void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf) { - float* maskBuf = new float[height * width](); - - Buffer mask(maskBuf, {width, height}); - Buffer output(dst, {width, height}); - - Buffer points(pointsBuf, {pointCount*3}); - Buffer weights(weightsBuf, {pointCount}); -#ifdef __riscv - idw(mask); -#else - static Func f("idw"); - - // try { - if (!f.defined()) { - Var x("x"), y("y"); - RDom r(0, pointCount); - - f(x, y) = 0.F; - Expr x0 = points(3*r+1); - Expr y0 = points(3*r); - Expr dx = x - x0; - Expr dy = y - y0; - Expr weight = weights(r); - f(x, y) += hypot(dx, dy) * weight; - - // f.vectorize(r, 8); - const int factor = 4; - f.update().atomic().vectorize(r, factor); - - // Compile - Target target; - target.os = Target::OS::Linux; - target.arch = Target::Arch::RISCV; - target.bits = 64; - target.vector_bits = factor * sizeof(float) * 8; - - // Tested XuanTie C906 has 128-bit vector unit - CV_Assert(target.vector_bits <= 128); - - std::vector features; - features.push_back(Target::RVV); - features.push_back(Target::NoAsserts); - features.push_back(Target::NoRuntime); - target.set_features(features); - - std::cout << target << std::endl; - f.print_loop_nest(); - - // Dump AOT code - f.compile_to_header("idw.h", {}, "idw", target); - f.compile_to_assembly("idw.s", {}, "idw", target); - } - // } - // catch (Halide::Error &e) { - // cout << e.what() << '\n'; - // } - - f.realize(mask); -#endif - - float maxVal = 193.0; - float minVal = 58.0; - float diff = maxVal - minVal; - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - dst[y*width + x] = (uint8_t) (255 * (maskBuf[y*width + x] - minVal) / diff); - } - } - delete[] maskBuf; -} - -void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights) { - float* mask = new float[height * width](); - - float maxVal = 193.0; - float minVal = 58.0; - - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - float dot = 0; - for (int i = 0; i < pointCount; i++) { - int x0 = points[3 * i + 1]; - int y0 = points[3 * i]; - int dx = x - x0; - int dy = y - y0; - - dot += sqrt(dx*dx + dy*dy) * weights[i]; - } - - mask[y*width + x] = dot; - } - } - - float diff = maxVal - minVal; - - for (int y = 0; y < height; y++) { - for (int x = 0; x < width; x++) { - dst[y*width + x] = (uint8_t) (255 * (mask[y*width + x] - minVal) / diff); - } - } - - delete[] mask; -} +#include + +#include "algos.hpp" + +#include +#include + +using namespace std; + +#ifdef __riscv + #include + #include "idw_halide.h" + #include "idw_halide_vec.h" + #include "idw_halide_parallel.h" + #include "idw_halide_parallel_vec.h" + using namespace Halide::Runtime; +#else + #include + using namespace Halide; +#endif + +static const int pointCount = 100; + +void idw_halide(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf) { + float* maskBuf = new float[height * width](); + + Buffer mask(maskBuf, {width, height}); + Buffer output(dst, {width, height}); + + Buffer points(pointsBuf, {pointCount*3}); + Buffer weights(weightsBuf, {pointCount}); +#ifdef __riscv +<<<<<<< HEAD + idw_halide_(mask); +#else + static Func f("idw_halide_"); +======= + idw(mask); +#else + static Func f("idw"); +>>>>>>> 3b97268b3450a5058fcae17b4ec4a816f514fcb9 + + // try { + if (!f.defined()) { + Var x("x"), y("y"); + RDom r(0, pointCount); +<<<<<<< HEAD + f(x, y) = 0.F; + Expr x0 = points(3*r+1); + Expr y0 = points(3*r); + Expr dx = x - x0; + Expr dy = y - y0; + Expr weight = weights(r); + f(x, y) += hypot(dx, dy) * weight; + + // f.vectorize(r, 8); + const int factor = 4; + // f.update().atomic().vectorize(r, factor); + // f.update().parallel(x); + + // Compile + Target target; + target.os = Target::OS::Linux; + target.arch = Target::Arch::RISCV; + target.bits = 64; + target.vector_bits = factor * sizeof(float) * 8; + + // Tested XuanTie C906 has 128-bit vector unit + CV_Assert(target.vector_bits <= 128); + + std::vector features; + features.push_back(Target::RVV); + features.push_back(Target::NoAsserts); + features.push_back(Target::NoRuntime); + target.set_features(features); + + std::cout << target << std::endl; + // f.print_loop_nest(); + + // Dump AOT code + f.compile_to_header("idw_halide.h", {}, "idw_halide_", target); + f.compile_to_assembly("idw_halide.s", {}, "idw_halide_", target); + } + // } + // catch (Halide::Error &e) { + // cout << e.what() << '\n'; + // } + + f.realize(mask); +#endif + + float maxVal = 193.0; + float minVal = 58.0; + float diff = maxVal - minVal; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + dst[y*width + x] = (uint8_t) (255 * (maskBuf[y*width + x] - minVal) / diff); + } + } + delete[] maskBuf; +} + +void idw_halide_parallel(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf) { + float* maskBuf = new float[height * width](); + + Buffer mask(maskBuf, {width, height}); + Buffer output(dst, {width, height}); + + Buffer points(pointsBuf, {pointCount*3}); + Buffer weights(weightsBuf, {pointCount}); +#ifdef __riscv + idw_halide_parallel_(mask); +#else + static Func f("idw_halide_parallel_"); + + try { + if (!f.defined()) { + Var x("x"), y("y"), x_outer("x_outer"), y_outer("y_outer"), x_inner("x_inner"), y_inner("y_inner"), tile_index("tile_index"); + RDom r(0, pointCount); + f(x, y) = 0.F; + Expr x0 = points(3*r+1); + Expr y0 = points(3*r); + Expr dx = x - x0; + Expr dy = y - y0; + Expr weight = weights(r); + f(x, y) += hypot(dx, dy) * weight; + + f.tile(x, y, x_outer, y_outer, x_inner, y_inner, width / 4, height / 4).fuse(x_outer, y_outer, tile_index).parallel(tile_index); + f.update().tile(x, y, x_outer, y_outer, x_inner, y_inner, width / 4, height / 4).fuse(x_outer, y_outer, tile_index).parallel(tile_index); + + const int factor = 4; + // f.update().atomic().vectorize(r, factor); + + // Compile + Target target; + target.os = Target::OS::Linux; + target.arch = Target::Arch::RISCV; + target.bits = 64; + target.vector_bits = factor * sizeof(float) * 8; + + // Tested XuanTie C906 has 128-bit vector unit + CV_Assert(target.vector_bits <= 128); + + std::vector features; + features.push_back(Target::RVV); + features.push_back(Target::NoAsserts); + features.push_back(Target::NoRuntime); + target.set_features(features); + + std::cout << target << std::endl; + // f.print_loop_nest(); + + // Dump AOT code + f.compile_to_header("idw_halide_parallel.h", {}, "idw_halide_parallel_", target); + f.compile_to_assembly("idw_halide_parallel.s", {}, "idw_halide_parallel_", target); + } + + f.realize(mask); + } + catch (Halide::Error &e) { + cout << e.what() << '\n'; + } +#endif + + float maxVal = 193.0; + float minVal = 58.0; + float diff = maxVal - minVal; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + dst[y*width + x] = (uint8_t) (255 * (maskBuf[y*width + x] - minVal) / diff); + } + } + delete[] maskBuf; +} + +void idw_halide_vec(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf) { + float* maskBuf = new float[height * width](); + + Buffer mask(maskBuf, {width, height}); + Buffer output(dst, {width, height}); + + Buffer points(pointsBuf, {pointCount*3}); + Buffer weights(weightsBuf, {pointCount}); +#ifdef __riscv + idw_halide_vec_(mask); +#else + static Func f("idw_halide_vec_"); + + // try { + if (!f.defined()) { + Var x("x"), y("y"); + RDom r(0, pointCount); +======= + +>>>>>>> 3b97268b3450a5058fcae17b4ec4a816f514fcb9 + f(x, y) = 0.F; + Expr x0 = points(3*r+1); + Expr y0 = points(3*r); + Expr dx = x - x0; + Expr dy = y - y0; + Expr weight = weights(r); + f(x, y) += hypot(dx, dy) * weight; + + // f.vectorize(r, 8); + const int factor = 4; + f.update().atomic().vectorize(r, factor); +<<<<<<< HEAD + // f.update().parallel(x); +======= +>>>>>>> 3b97268b3450a5058fcae17b4ec4a816f514fcb9 + + // Compile + Target target; + target.os = Target::OS::Linux; + target.arch = Target::Arch::RISCV; + target.bits = 64; + target.vector_bits = factor * sizeof(float) * 8; + + // Tested XuanTie C906 has 128-bit vector unit + CV_Assert(target.vector_bits <= 128); + + std::vector features; + features.push_back(Target::RVV); + features.push_back(Target::NoAsserts); + features.push_back(Target::NoRuntime); + target.set_features(features); + + std::cout << target << std::endl; +<<<<<<< HEAD + // f.print_loop_nest(); + + // Dump AOT code + f.compile_to_header("idw_halide_vec.h", {}, "idw_halide_vec_", target); + f.compile_to_assembly("idw_halide_vec.s", {}, "idw_halide_vec_", target); +======= + f.print_loop_nest(); + + // Dump AOT code + f.compile_to_header("idw.h", {}, "idw", target); + f.compile_to_assembly("idw.s", {}, "idw", target); +>>>>>>> 3b97268b3450a5058fcae17b4ec4a816f514fcb9 + } + // } + // catch (Halide::Error &e) { + // cout << e.what() << '\n'; + // } + + f.realize(mask); +#endif + + float maxVal = 193.0; + float minVal = 58.0; + float diff = maxVal - minVal; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + dst[y*width + x] = (uint8_t) (255 * (maskBuf[y*width + x] - minVal) / diff); + } + } + delete[] maskBuf; +} + +<<<<<<< HEAD +void idw_halide_parallel_vec(const uint8_t* src, uint8_t* dst, int height, int width, int* pointsBuf, float* weightsBuf) { + float* maskBuf = new float[height * width](); + + Buffer mask(maskBuf, {width, height}); + Buffer output(dst, {width, height}); + + Buffer points(pointsBuf, {pointCount*3}); + Buffer weights(weightsBuf, {pointCount}); +#ifdef __riscv + idw_halide_parallel_vec_(mask); +#else + static Func f("idw_halide_parallel_vec_"); + + try { + if (!f.defined()) { + Var x("x"), y("y"), x_outer("x_outer"), y_outer("y_outer"), x_inner("x_inner"), y_inner("y_inner"), tile_index("tile_index"); + RDom r(0, pointCount); + f(x, y) = 0.F; + Expr x0 = points(3*r+1); + Expr y0 = points(3*r); + Expr dx = x - x0; + Expr dy = y - y0; + Expr weight = weights(r); + f(x, y) += hypot(dx, dy) * weight; + + f.tile(x, y, x_outer, y_outer, x_inner, y_inner, width / 4, height / 4).fuse(x_outer, y_outer, tile_index).parallel(tile_index); + f.update().tile(x, y, x_outer, y_outer, x_inner, y_inner, width / 4, height / 4).fuse(x_outer, y_outer, tile_index).parallel(tile_index); + + const int factor = 4; + f.update().atomic().vectorize(r, factor); + + // Compile + Target target; + target.os = Target::OS::Linux; + target.arch = Target::Arch::RISCV; + target.bits = 64; + target.vector_bits = factor * sizeof(float) * 8; + + // Tested XuanTie C906 has 128-bit vector unit + CV_Assert(target.vector_bits <= 128); + + std::vector features; + features.push_back(Target::RVV); + features.push_back(Target::NoAsserts); + features.push_back(Target::NoRuntime); + target.set_features(features); + + std::cout << target << std::endl; + // f.print_loop_nest(); + + // Dump AOT code + f.compile_to_header("idw_halide_parallel_vec.h", {}, "idw_halide_parallel_vec_", target); + f.compile_to_assembly("idw_halide_parallel_vec.s", {}, "idw_halide_parallel_vec_", target); + } + + f.realize(mask); + } + catch (Halide::Error &e) { + cout << e.what() << '\n'; + } +#endif + + float maxVal = 193.0; + float minVal = 58.0; + float diff = maxVal - minVal; + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + dst[y*width + x] = (uint8_t) (255 * (maskBuf[y*width + x] - minVal) / diff); + } + } + delete[] maskBuf; +} + +======= +>>>>>>> 3b97268b3450a5058fcae17b4ec4a816f514fcb9 +void idw_ref(const uint8_t* src, uint8_t* dst, int height, int width, int* points, float* weights) { + float* mask = new float[height * width](); + + float maxVal = 193.0; + float minVal = 58.0; + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + float dot = 0; + for (int i = 0; i < pointCount; i++) { + int x0 = points[3 * i + 1]; + int y0 = points[3 * i]; + int dx = x - x0; + int dy = y - y0; + + dot += sqrt(dx*dx + dy*dy) * weights[i]; + } + + mask[y*width + x] = dot; + } + } + + float diff = maxVal - minVal; + + for (int y = 0; y < height; y++) { + for (int x = 0; x < width; x++) { + dst[y*width + x] = (uint8_t) (255 * (mask[y*width + x] - minVal) / diff); + } + } + + delete[] mask; +} diff --git a/test/test_main.cpp b/test/test_main.cpp index 9fadfbd..4cb0dc8 100644 --- a/test/test_main.cpp +++ b/test/test_main.cpp @@ -1,188 +1,202 @@ -// Copyright (C), 2023, KNS Group LLC (YADRO) - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include - -#include "algos.hpp" - -using namespace cv; - -const int width = 1920; -const int height = 1080; - -static const int julia_width = 200; -static const int julia_height = 200; - -CV_TEST_MAIN("") - -TEST(julia, ref) { - Mat dst(julia_height, julia_width, CV_8UC1, Scalar(0)); - - julia_ref(dst.ptr(), dst.rows, dst.cols); - - imwrite("julia_ref.png", dst); -} - -TEST(julia, halide) { - Mat dst(julia_height, julia_width, CV_8UC1, Scalar(0)); - - halide_julia(dst.ptr(), dst.rows, dst.cols); - - imwrite("julia_halide.png", dst); -} - -TEST(histogram, opencv) { - Mat src(height, width, CV_8UC3), dst(3, 256, CV_32F), ref(3, 256, CV_32S); - randu(src, 0, 256); - - histogram_opencv(src, dst); - histogram_ref(src.ptr(), ref.ptr(), src.rows, src.cols); - - ref.convertTo(ref, CV_32F); - ASSERT_EQ(countNonZero(dst != ref), 0); -} - -TEST(histogram, halide) -{ - Mat src(height, width, CV_8UC3), dst(3, 256, CV_32S), ref(3, 256, CV_32S); - randu(src, 0, 256); - - histogram_halide(src.ptr(), dst.ptr(), src.rows, src.cols); - histogram_ref(src.ptr(), ref.ptr(), src.rows, src.cols); - - ASSERT_EQ(countNonZero(dst != ref), 0); -} - -TEST(bgr2gray, opencv) { - Mat src(height, width, CV_8UC3), dst(height, width, CV_8U), ref(height, width, CV_8U); - randu(src, 0, 256); - - bgr2gray_opencv(src, dst); - bgr2gray_ref(src.ptr(), ref.ptr(), src.rows, src.cols); - - ASSERT_LE(norm(ref, dst, NORM_INF), 1); -} - -TEST(bgr2gray_interleaved, halide) { - Mat src(height, width, CV_8UC3), dst(height, width, CV_8U), ref(height, width, CV_8U); - randu(src, 0, 256); - - bgr2gray_interleaved_halide(src.ptr(), dst.ptr(), src.rows, src.cols); - bgr2gray_ref(src.ptr(), ref.ptr(), src.rows, src.cols); - - ASSERT_LE(norm(ref, dst, NORM_INF), 0); -} - -TEST(bgr2gray_planar, halide) { - Mat src(height, width, CV_8UC3), dst(height, width, CV_8U), ref(height, width, CV_8U); - randu(src, 0, 256); - Mat planar(height * 3, width, CV_8U); - - std::vector channels(3); - channels[0] = planar.rowRange(0, height); - channels[1] = planar.rowRange(height, height * 2); - channels[2] = planar.rowRange(height * 2, height * 3); - cv::split(src, channels); - - bgr2gray_ref(src.ptr(), ref.ptr(), src.rows, src.cols); - bgr2gray_planar_halide(planar.ptr(), dst.ptr(), src.rows, src.cols); - - ASSERT_LE(norm(ref, dst, NORM_INF), 0); -} - -TEST(boxFilter, halide) { - Mat src(height, width, CV_8U), dst(height - 2, width - 2, CV_16U), ref(height, width, CV_8U); - randu(src, 0, 256); - Mat src16; - src.convertTo(src16, CV_16U); - - boxFilter_halide(src16.ptr(), dst.ptr(), src.rows, src.cols); - boxFilter_opencv(src, ref); - - ref = ref.rowRange(1, height - 1).colRange(1, width - 1); - ref.convertTo(ref, CV_16U); - ASSERT_LE(norm(ref, dst, NORM_INF), 1); -} - -#ifdef HAVE_OPENCV_DNN - -TEST(convolution_nchw, halide) { - static const int ic = 16; - static const int oc = 32; - static const int height = 128; - static const int width = 128; - Mat src({1, ic, height, width}, CV_32F); - Mat kernel({oc, ic, 3, 3}, CV_32F); - Mat ref({1, oc, height - 2, width - 2}, CV_32F); - Mat dst({1, oc, height - 2, width - 2}, CV_32F); - randn(src, 0, 1); - randn(kernel, 0, 1); - - convolution_opencv(src, kernel, ref, ic, oc); - convolution_nchw_halide(src.ptr(), kernel.ptr(), dst.ptr(), - ic, oc, height, width); - - ASSERT_LE(norm(ref, dst, NORM_INF), 4e-5f); -} - -TEST(convolution_nhwc, halide) { - static const int ic = 16; - static const int oc = 32; - static const int height = 128; - static const int width = 128; - Mat src_nchw({1, ic, height, width}, CV_32F); - Mat kernel_oihw({oc, ic, 3, 3}, CV_32F); - Mat ref({1, oc, height - 2, width - 2}, CV_32F); - Mat dst_nhwc({1, height - 2, width - 2, oc}, CV_32F); - randn(src_nchw, 0, 1); - randn(kernel_oihw, 0, 1); - - convolution_opencv(src_nchw, kernel_oihw, ref, ic, oc); - - Mat kernel_ihwo, src_nhwc; - cv::transpose(kernel_oihw.reshape(1, oc), kernel_ihwo); - cv::transpose(src_nchw.reshape(1, ic), src_nhwc); - - convolution_nhwc_halide(src_nhwc.ptr(), kernel_ihwo.ptr(), dst_nhwc.ptr(), - ic, oc, height, width); - - Mat dst; - cv::transpose(dst_nhwc.reshape(1, dst_nhwc.size[1] * dst_nhwc.size[2]), dst); - - ASSERT_LE(norm(ref.reshape(1, 1), dst.reshape(1, 1), NORM_INF), 4e-5f); -} - -#endif // HAVE_OPENCV_DNN - - -TEST(idw, halide) { - Mat src(height, width, CV_8U); - Mat dst(height, width, CV_8U), cl_dst(height, width, CV_8UC3), dst_h(height, width, CV_8U), cl_dst_h(height, width, CV_8UC3); - // randu(src, 0, 256); - - // 300 elems (100 points) - int points[] = {0, 0, 72, 0, 213, 79, 0, 426, 60, 0, 640, 76, 0, 853, 128, 0, 1066, 67, 0, 1280, 65, 0, 1493, 64, 0, 1706, 60, 0, 1920, 61, 120, 0, 81, 120, 213, 79, 120, 426, 58, 120, 640, 132, 120, 853, 149, 120, 1066, 142, 120, 1280, 64, 120, 1493, 69, 120, 1706, 65, 120, 1920, 64, 240, 0, 75, 240, 213, 68, 240, 426, 140, 240, 640, 153, 240, 853, 145, 240, 1066, 132, 240, 1280, 152, 240, 1493, 125, 240, 1706, 66, 240, 1920, 58, 360, 0, 78, 360, 213, 60, 360, 426, 139, 360, 640, 168, 360, 853, 154, 360, 1066, 138, 360, 1280, 145, 360, 1493, 160, 360, 1706, 68, 360, 1920, 60, 480, 0, 77, 480, 213, 59, 480, 426, 165, 480, 640, 183, 480, 853, 166, 480, 1066, 142, 480, 1280, 123, 480, 1493, 155, 480, 1706, 144, 480, 1920, 60, 600, 0, 83, 600, 213, 65, 600, 426, 178, 600, 640, 184, 600, 853, 138, 600, 1066, 124, 600, 1280, 132, 600, 1493, 175, 600, 1706, 175, 600, 1920, 60, 720, 0, 86, 720, 213, 60, 720, 426, 182, 720, 640, 179, 720, 853, 152, 720, 1066, 140, 720, 1280, 115, 720, 1493, 156, 720, 1706, 172, 720, 1920, 65, 840, 0, 90, 840, 213, 69, 840, 426, 164, 840, 640, 186, 840, 853, 146, 840, 1066, 147, 840, 1280, 122, 840, 1493, 152, 840, 1706, 166, 840, 1920, 69, 960, 0, 85, 960, 213, 67, 960, 426, 186, 960, 640, 169, 960, 853, 162, 960, 1066, 147, 960, 1280, 124, 960, 1493, 149, 960, 1706, 157, 960, 1920, 69, 1080, 0, 85, 1080, 213, 70, 1080, 426, 193, 1080, 640, 192, 1080, 853, 155, 1080, 1066, 135, 1080, 1280, 120, 1080, 1493, 138, 1080, 1706, 128, 1080, 1920, 70}; - // 100 elems - float weights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; - - idw_ref(NULL, dst.ptr(), height, width, points, weights); - idw_halide(NULL, dst_h.ptr(), height, width, points, weights); - - applyColorMap(dst, cl_dst, COLORMAP_JET); - applyColorMap(dst_h, cl_dst_h, COLORMAP_JET); - - // imwrite("src.png", src); - imwrite("res.png", dst); - imwrite("cres.png", cl_dst); - imwrite("res_halide.png", dst_h); - imwrite("cres_halide.png", cl_dst_h); +// Copyright (C), 2023, KNS Group LLC (YADRO) + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "algos.hpp" + +using namespace cv; + +const int width = 1920; +const int height = 1080; + +static const int julia_width = 200; +static const int julia_height = 200; + +CV_TEST_MAIN("") + +TEST(julia, ref) { + Mat dst(julia_height, julia_width, CV_8UC1, Scalar(0)); + + julia_ref(dst.ptr(), dst.rows, dst.cols); + + imwrite("julia_ref.png", dst); +} + +TEST(julia, halide) { + Mat dst(julia_height, julia_width, CV_8UC1, Scalar(0)); + + halide_julia(dst.ptr(), dst.rows, dst.cols); + + imwrite("julia_halide.png", dst); +} + +TEST(histogram, opencv) { + Mat src(height, width, CV_8UC3), dst(3, 256, CV_32F), ref(3, 256, CV_32S); + randu(src, 0, 256); + + histogram_opencv(src, dst); + histogram_ref(src.ptr(), ref.ptr(), src.rows, src.cols); + + ref.convertTo(ref, CV_32F); + ASSERT_EQ(countNonZero(dst != ref), 0); +} + +TEST(histogram, halide) +{ + Mat src(height, width, CV_8UC3), dst(3, 256, CV_32S), ref(3, 256, CV_32S); + randu(src, 0, 256); + + histogram_halide(src.ptr(), dst.ptr(), src.rows, src.cols); + histogram_ref(src.ptr(), ref.ptr(), src.rows, src.cols); + + ASSERT_EQ(countNonZero(dst != ref), 0); +} + +TEST(bgr2gray, opencv) { + Mat src(height, width, CV_8UC3), dst(height, width, CV_8U), ref(height, width, CV_8U); + randu(src, 0, 256); + + bgr2gray_opencv(src, dst); + bgr2gray_ref(src.ptr(), ref.ptr(), src.rows, src.cols); + + ASSERT_LE(norm(ref, dst, NORM_INF), 1); +} + +TEST(bgr2gray_interleaved, halide) { + Mat src(height, width, CV_8UC3), dst(height, width, CV_8U), ref(height, width, CV_8U); + randu(src, 0, 256); + + bgr2gray_interleaved_halide(src.ptr(), dst.ptr(), src.rows, src.cols); + bgr2gray_ref(src.ptr(), ref.ptr(), src.rows, src.cols); + + ASSERT_LE(norm(ref, dst, NORM_INF), 0); +} + +TEST(bgr2gray_planar, halide) { + Mat src(height, width, CV_8UC3), dst(height, width, CV_8U), ref(height, width, CV_8U); + randu(src, 0, 256); + Mat planar(height * 3, width, CV_8U); + + std::vector channels(3); + channels[0] = planar.rowRange(0, height); + channels[1] = planar.rowRange(height, height * 2); + channels[2] = planar.rowRange(height * 2, height * 3); + cv::split(src, channels); + + bgr2gray_ref(src.ptr(), ref.ptr(), src.rows, src.cols); + bgr2gray_planar_halide(planar.ptr(), dst.ptr(), src.rows, src.cols); + + ASSERT_LE(norm(ref, dst, NORM_INF), 0); +} + +TEST(boxFilter, halide) { + Mat src(height, width, CV_8U), dst(height - 2, width - 2, CV_16U), ref(height, width, CV_8U); + randu(src, 0, 256); + Mat src16; + src.convertTo(src16, CV_16U); + + boxFilter_halide(src16.ptr(), dst.ptr(), src.rows, src.cols); + boxFilter_opencv(src, ref); + + ref = ref.rowRange(1, height - 1).colRange(1, width - 1); + ref.convertTo(ref, CV_16U); + ASSERT_LE(norm(ref, dst, NORM_INF), 1); +} + +#ifdef HAVE_OPENCV_DNN + +TEST(convolution_nchw, halide) { + static const int ic = 16; + static const int oc = 32; + static const int height = 128; + static const int width = 128; + Mat src({1, ic, height, width}, CV_32F); + Mat kernel({oc, ic, 3, 3}, CV_32F); + Mat ref({1, oc, height - 2, width - 2}, CV_32F); + Mat dst({1, oc, height - 2, width - 2}, CV_32F); + randn(src, 0, 1); + randn(kernel, 0, 1); + + convolution_opencv(src, kernel, ref, ic, oc); + convolution_nchw_halide(src.ptr(), kernel.ptr(), dst.ptr(), + ic, oc, height, width); + + ASSERT_LE(norm(ref, dst, NORM_INF), 4e-5f); +} + +TEST(convolution_nhwc, halide) { + static const int ic = 16; + static const int oc = 32; + static const int height = 128; + static const int width = 128; + Mat src_nchw({1, ic, height, width}, CV_32F); + Mat kernel_oihw({oc, ic, 3, 3}, CV_32F); + Mat ref({1, oc, height - 2, width - 2}, CV_32F); + Mat dst_nhwc({1, height - 2, width - 2, oc}, CV_32F); + randn(src_nchw, 0, 1); + randn(kernel_oihw, 0, 1); + + convolution_opencv(src_nchw, kernel_oihw, ref, ic, oc); + + Mat kernel_ihwo, src_nhwc; + cv::transpose(kernel_oihw.reshape(1, oc), kernel_ihwo); + cv::transpose(src_nchw.reshape(1, ic), src_nhwc); + + convolution_nhwc_halide(src_nhwc.ptr(), kernel_ihwo.ptr(), dst_nhwc.ptr(), + ic, oc, height, width); + + Mat dst; + cv::transpose(dst_nhwc.reshape(1, dst_nhwc.size[1] * dst_nhwc.size[2]), dst); + + ASSERT_LE(norm(ref.reshape(1, 1), dst.reshape(1, 1), NORM_INF), 4e-5f); +} + +#endif // HAVE_OPENCV_DNN + + +TEST(idw, halide) { + Mat src(height, width, CV_8U); + Mat dst(height, width, CV_8U), cl_dst(height, width, CV_8UC3), dst_h(height, width, CV_8U), cl_dst_h(height, width, CV_8UC3); + Mat dst_hp(height, width, CV_8U), cl_dst_hp(height, width, CV_8UC3), dst_hv(height, width, CV_8U), cl_dst_hv(height, width, CV_8UC3), + dst_hpv(height, width, CV_8U), cl_dst_hpv(height, width, CV_8UC3); + // randu(src, 0, 256); + + // 300 elems (100 points) + int points[] = {0, 0, 72, 0, 213, 79, 0, 426, 60, 0, 640, 76, 0, 853, 128, 0, 1066, 67, 0, 1280, 65, 0, 1493, 64, 0, 1706, 60, 0, 1920, 61, 120, 0, 81, 120, 213, 79, 120, 426, 58, 120, 640, 132, 120, 853, 149, 120, 1066, 142, 120, 1280, 64, 120, 1493, 69, 120, 1706, 65, 120, 1920, 64, 240, 0, 75, 240, 213, 68, 240, 426, 140, 240, 640, 153, 240, 853, 145, 240, 1066, 132, 240, 1280, 152, 240, 1493, 125, 240, 1706, 66, 240, 1920, 58, 360, 0, 78, 360, 213, 60, 360, 426, 139, 360, 640, 168, 360, 853, 154, 360, 1066, 138, 360, 1280, 145, 360, 1493, 160, 360, 1706, 68, 360, 1920, 60, 480, 0, 77, 480, 213, 59, 480, 426, 165, 480, 640, 183, 480, 853, 166, 480, 1066, 142, 480, 1280, 123, 480, 1493, 155, 480, 1706, 144, 480, 1920, 60, 600, 0, 83, 600, 213, 65, 600, 426, 178, 600, 640, 184, 600, 853, 138, 600, 1066, 124, 600, 1280, 132, 600, 1493, 175, 600, 1706, 175, 600, 1920, 60, 720, 0, 86, 720, 213, 60, 720, 426, 182, 720, 640, 179, 720, 853, 152, 720, 1066, 140, 720, 1280, 115, 720, 1493, 156, 720, 1706, 172, 720, 1920, 65, 840, 0, 90, 840, 213, 69, 840, 426, 164, 840, 640, 186, 840, 853, 146, 840, 1066, 147, 840, 1280, 122, 840, 1493, 152, 840, 1706, 166, 840, 1920, 69, 960, 0, 85, 960, 213, 67, 960, 426, 186, 960, 640, 169, 960, 853, 162, 960, 1066, 147, 960, 1280, 124, 960, 1493, 149, 960, 1706, 157, 960, 1920, 69, 1080, 0, 85, 1080, 213, 70, 1080, 426, 193, 1080, 640, 192, 1080, 853, 155, 1080, 1066, 135, 1080, 1280, 120, 1080, 1493, 138, 1080, 1706, 128, 1080, 1920, 70}; + // 100 elems + float weights[] = {0.04053167, -0.07571915, 0.0025228, 0.1974248, -0.14461569, 0.30541419, -0.06167972, -0.04958807, 0.00050641, -0.02023852, -0.05910183, -0.05512609, 0.37617088, -0.15904428, -0.10229038, -0.34355378, 0.39128499, 0.16502984, -0.00916588, -0.05264022, 0.01936275, 0.07156495, -0.35144818, -0.01955447, 0.07818357, 0.10850975, -0.34972095, -0.12186563, 0.04116577, 0.00378971, -0.05382689, 0.12480152, 0.0629668, 0.01070963, 0.03499341, 0.03218783, -0.00559502, -0.21214646, 0.34402987, -0.0246006, -0.01532894, 0.14944613, -0.12848858, -0.04539923, -0.1401344, -0.08143395, 0.18241941, 0.08353522, -0.20102434, 0.05168503, -0.05897654, 0.09669851, -0.13024191, -0.02125988, 0.19499928, 0.12311092, -0.05857943, -0.14954968, -0.19763496, 0.1172914, -0.04584096, 0.20130274, -0.198312, 0.04763513, -0.05524272, -0.07034198, 0.15155036, 0.0718368, -0.09612551, 0.09076872, -0.0863682, 0.08457486, 0.05520895, -0.10551672, 0.12577908, -0.05857014, 0.03529613, 0.01300365, -0.09736055, 0.06580921, -0.03074898, 0.1528181, -0.18295799, 0.18713497, -0.07368328, -0.06431868, 0.02485816, -0.02782657, -0.15182979, 0.06845911, -0.01974711, 0.23508139, -0.23124445, -0.1202987, 0.05380124, 0.04331648, 0.06285662, -0.01171756, 0.01128749, 0.06012665}; + + idw_ref(NULL, dst.ptr(), height, width, points, weights); + idw_halide(NULL, dst_h.ptr(), height, width, points, weights); + idw_halide_parallel(NULL, dst_hp.ptr(), height, width, points, weights); + idw_halide_vec(NULL, dst_hv.ptr(), height, width, points, weights); + idw_halide_parallel_vec(NULL, dst_hpv.ptr(), height, width, points, weights); + + applyColorMap(dst, cl_dst, COLORMAP_JET); + applyColorMap(dst_h, cl_dst_h, COLORMAP_JET); + applyColorMap(dst_hp, cl_dst_hp, COLORMAP_JET); + applyColorMap(dst_hv, cl_dst_hv, COLORMAP_JET); + applyColorMap(dst_hpv, cl_dst_hpv, COLORMAP_JET); + + // imwrite("src.png", src); + imwrite("res.png", dst); + imwrite("cres.png", cl_dst); + imwrite("res_halide.png", dst_h); + imwrite("cres_halide.png", cl_dst_h); + imwrite("res_halide_par.png", dst_hp); + imwrite("cres_halide_par.png", cl_dst_hp); + imwrite("res_halide_vec.png", dst_hv); + imwrite("cres_halide_vec.png", cl_dst_hv); + imwrite("res_halide_parvec.png", dst_hpv); + imwrite("cres_halide_parvec.png", cl_dst_hpv); } \ No newline at end of file