Comments (5)
JuliaSIMD/LoopVectorization.jl@6d7b07e
Fix:
JuliaSIMD/LoopVectorization.jl@20386cc
There was a `+` where there should have been a `-`.
from tullio.jl.
Here's a Tullio-free reproducer, obtained by simplifying what this prints out: @tullio s_loopvec = A[a,b,a,b] avx=true grad=false verbose=2 threads=false
julia> A = ones(2, 2, 2, 2); # also wrong for e.g. A = ones(8,10,8,10)
julia> s_for = sum(A[a,b,a,b] for a in axes(A,1), b in axes(A,2))
4.0
julia> using LoopVectorization
julia> function sum169(A, 𝒶𝓍a=axes(A,1), 𝒶𝓍b=axes(A,2))
𝒜𝒸𝒸 = 0.0
LoopVectorization.@avx unroll = 0 for b in 𝒶𝓍b
for a = 𝒶𝓍a
𝒜𝒸𝒸 = 𝒜𝒸𝒸 + A[a, b, a, b]
end
end
𝒜𝒸𝒸
end
sum169 (generic function with 3 methods)
julia> sum169(A)
4.677771489e-314
from tullio.jl.
If anyone wants to look into this, start in this file:
https://github.com/JuliaSIMD/LoopVectorization.jl/blob/ded74ba4229c5bdb286057204aa50a8c8895f472/src/parse/memory_ops_common.jl#L542
Look at what's happening to `A[a, b, a, b]`. LV will try to reshape it into a 2-d array, `B[a, b]`, where `stride(B,1) == stride(A,1) + stride(A,3)` and `stride(B,2) == stride(A,2) + stride(A,4)`.
But something seems to be off there. Looks like it is reading junk data.
Check:
- Is it still 8-byte aligned? I.e., is something going wrong when dealing with the offset indices (LV generates 0-based index code, and thus must transform them)?
- Is it reading out of bounds?
from tullio.jl.
You probably just need to look at the `@macroexpand` output, since this is probably going wrong in the preprocessing:
julia> @macroexpand LoopVectorization.@avx unroll = 0 for b in 𝒶𝓍b
for a = 𝒶𝓍a
𝒜𝒸𝒸 = 𝒜𝒸𝒸 + A[a, b, a, b]
end
end
quote
begin
#= REPL[15]:5 =#
nothing
end
begin
#= REPL[15]:4 =#
nothing
end
begin
#= REPL[15]:3 =#
nothing
end
begin
#= REPL[15]:2 =#
nothing
end
begin
#= REPL[15]:1 =#
nothing
end
var"###looprangeb###1###" = LoopVectorization.canonicalize_range(begin
$(Expr(:inbounds, true))
local var"#68#val" = 𝒶𝓍b
$(Expr(:inbounds, :pop))
var"#68#val"
end)
var"###looplenb###2###" = StaticArrayInterface.static_length(var"###looprangeb###1###")
var"###b_loop_lower_bound###3###" = LoopVectorization.maybestaticfirst(var"###looprangeb###1###")
var"###b_loop_upper_bound###4###" = LoopVectorization.maybestaticlast(var"###looprangeb###1###")
var"###b_loop_step###5###" = LoopVectorization.static_step(var"###looprangeb###1###")
var"###looprangea###6###" = LoopVectorization.canonicalize_range(begin
$(Expr(:inbounds, true))
local var"#69#val" = 𝒶𝓍a
$(Expr(:inbounds, :pop))
var"#69#val"
end)
var"###looplena###7###" = StaticArrayInterface.static_length(var"###looprangea###6###")
var"###a_loop_lower_bound###8###" = LoopVectorization.maybestaticfirst(var"###looprangea###6###")
var"###a_loop_upper_bound###9###" = LoopVectorization.maybestaticlast(var"###looprangea###6###")
var"###a_loop_step###10###" = LoopVectorization.static_step(var"###looprangea###6###")
if LoopVectorization.check_args(A, typeof(𝒜𝒸𝒸)) && (true && $(Expr(:&&, :((LoopVectorization.can_turbo)(LoopVectorization.add_fast, Val{2}())))))
(var"##vptr##_A", var"#A#preserve#buffer#") = LoopVectorization.stridedpointer_preserve(A)
var"##vptr##_A##ind##1##repeated##3##" = LoopVectorization.VectorizationBase.double_index(var"##vptr##_A", Val{0}(), Val{2}())
var"##vptr##_A##ind##1##repeated##3####ind##3##repeated##3##" = LoopVectorization.VectorizationBase.double_index(var"##vptr##_A##ind##1##repeated##3##", Val{2}(), Val{2}())
var"####grouped#strided#pointer####12###" = (getfield)(LoopVectorization.grouped_strided_pointer((LoopVectorization.densewrapper(LoopVectorization.gespf1(var"##vptr##_A##ind##1##repeated##3####ind##3##repeated##3##", (var"###a_loop_lower_bound###8###", var"###b_loop_lower_bound###3###")), A),), Val{()}()), 1)
begin
$(Expr(:gc_preserve, :(var"##𝒜𝒸𝒸_##onevec##" = begin
var"##vargsym#257" = ((LoopVectorization.zerorangestart(var"###looprangeb###1###"), LoopVectorization.zerorangestart(var"###looprangea###6###")), (var"####grouped#strided#pointer####12###", Base.eltype(𝒜𝒸𝒸)))
var"##Tloopeltype##" = LoopVectorization.promote_type(LoopVectorization.eltype(A), Base.eltype(𝒜𝒸𝒸))
var"##Wvecwidth##" = LoopVectorization.pick_vector_width(var"##Tloopeltype##")
LoopVectorization._turbo_!(LoopVectorization.avx_config_val(Val{(false, 0, 0, 0, false, 0x0000000000000001, 1, true)}(), var"##Wvecwidth##"), Val{(Symbol("##DROPPED#CONSTANT##"), Symbol("##DROPPED#CONSTANT##"), LoopVectorization.OperationStruct(0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000021, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.constant, 0x0001, 0x0000), :LoopVectorization, :getindex, LoopVectorization.OperationStruct(0x00000000000000000000000000000021, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.memload, 0x0002, 0x0001), :LoopVectorization, :add_fast, LoopVectorization.OperationStruct(0x00000000000000000000000000000021, 0x00000000000000000000000000000021, 0x00000000000000000000000000000000, 0x00000000000000000000000000010002, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, 0x00000000000000000000000000000000, LoopVectorization.compute, 0x0001, 0x0000))}(), Val{(LoopVectorization.ArrayRefStruct{:A, Symbol("##vptr##_A##ind##1##repeated##3####ind##3##repeated##3##")}(0x00000000000000000000000000000101, 0x00000000000000000000000000000201, 0x00000000000000000000000000000000, 0x00000000000000000000000000000101),)}(), Val{(0, (3,), (1,), (), (), (), ())}(), Val{(:b, :a)}(), Base.Val(Base.typeof(var"##vargsym#257")), LoopVectorization.flatten_to_tuple(var"##vargsym#257")...)
end), Symbol("#A#preserve#buffer#")))
In particular above, see
(var"##vptr##_A", var"#A#preserve#buffer#") = LoopVectorization.stridedpointer_preserve(A)
var"##vptr##_A##ind##1##repeated##3##" = LoopVectorization.VectorizationBase.double_index(var"##vptr##_A", Val{0}(), Val{2}())
var"##vptr##_A##ind##1##repeated##3####ind##3##repeated##3##" = LoopVectorization.VectorizationBase.double_index(var"##vptr##_A##ind##1##repeated##3##", Val{2}(), Val{2}())
which you can easily run in the REPL.
Or, just looking at the arguments: it is passing `Val{0}()`, `Val{2}()` the first time. That looks correct: with 0-based indexing, that means it is collapsing the two `a` indices. The next one is `Val{2}()`, `Val{2}()`. That is obviously wrong. It should probably actually be `Val{0}()` and `Val{2}()` again.
from tullio.jl.
Thanks!
from tullio.jl.
Related Issues (20)
- Reporting a bug when Tullio being included with LoopVectorization HOT 1
- [Question] Is it possible to create a vector of SVectors from a Matrix using Tullio? HOT 2
- [Question] How to change summation order? HOT 5
- Use package extensions HOT 1
- How finalizers `|>` work HOT 5
- Method error when broadcast and sum of matrices HOT 1
- GPU Kernel Compilation Failed with Interpolations HOT 2
- Upgrade to CUDA.CUDAKernels HOT 9
- Add Finch.jl backend HOT 4
- CUDA v4 support HOT 2
- Using threads, vs setting threads=false gives different result HOT 3
- Issue with vectorized functions on GPU HOT 3
- Error when specifying the range of an index with a UnitRange HOT 4
- Scalar indexing with CUDA HOT 10
- Please update dep of FillArrays to v1.
- Bad interaction with Enzyme? HOT 6
- Zygote with Tullio gives wrong gradients/pullbacks using CUDA HOT 1
- Use EllipsisNotation ? HOT 2
- Error when using Loopvectorization
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from tullio.jl.