[ Info: loading pretrain bert model: uncased_L-12_H-768_A-12.tfbson wordpiece
[ Info: loading pretrain bert model: uncased_L-12_H-768_A-12.tfbson tokenizer
[ Info: loading pretrain bert model: uncased_L-12_H-768_A-12.tfbson bert_model
[ Info: start training
[ Info: epoch: 1
ERROR: LoadError: GPU compilation of kernel #broadcast_kernel#15(CUDA.CuKernelContext, CUDA.CuDeviceArray{Float32, 4, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4}, NTuple{4, Base.OneTo{Int64}}, typeof(+), Tuple{Base.Broadcast.Extruded{CUDA.CuDeviceArray{Float32, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Base.Broadcast.Extruded{Array{Float32, 4}, NTuple{4, Bool}, NTuple{4, Int64}}}}, Int64) failed
KernelError: passing and using non-bitstype argument
Argument 4 to your kernel function is of type Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4}, NTuple{4, Base.OneTo{Int64}}, typeof(+), Tuple{Base.Broadcast.Extruded{CUDA.CuDeviceArray{Float32, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Base.Broadcast.Extruded{Array{Float32, 4}, NTuple{4, Bool}, NTuple{4, Int64}}}}, which is not isbits:
.args is of type Tuple{Base.Broadcast.Extruded{CUDA.CuDeviceArray{Float32, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Base.Broadcast.Extruded{Array{Float32, 4}, NTuple{4, Bool}, NTuple{4, Int64}}} which is not isbits.
.2 is of type Base.Broadcast.Extruded{Array{Float32, 4}, NTuple{4, Bool}, NTuple{4, Int64}} which is not isbits.
.x is of type Array{Float32, 4} which is not isbits.
Stacktrace:
[1] check_invocation(job::GPUCompiler.CompilerJob)
@ GPUCompiler C:\Users\jackn\.julia\packages\GPUCompiler\iaKrd\src\validation.jl:86
[2] macro expansion
@ C:\Users\jackn\.julia\packages\GPUCompiler\iaKrd\src\driver.jl:413 [inlined]
[3] macro expansion
@ C:\Users\jackn\.julia\packages\TimerOutputs\jgSVI\src\TimerOutput.jl:252 [inlined]
[4] macro expansion
@ C:\Users\jackn\.julia\packages\GPUCompiler\iaKrd\src\driver.jl:412 [inlined]
[5] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
@ GPUCompiler C:\Users\jackn\.julia\packages\GPUCompiler\iaKrd\src\utils.jl:64
[6] cufunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.Context)
@ CUDA C:\Users\jackn\.julia\packages\CUDA\tTK8Y\src\compiler\execution.jl:354
[7] #224
@ C:\Users\jackn\.julia\packages\CUDA\tTK8Y\src\compiler\execution.jl:347 [inlined]
[8] JuliaContext(f::CUDA.var"#224#225"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{GPUArrays.var"#broadcast_kernel#15", Tuple{CUDA.CuKernelContext, CUDA.CuDeviceArray{Float32, 4, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4}, NTuple{4, Base.OneTo{Int64}}, typeof(+), Tuple{Base.Broadcast.Extruded{CUDA.CuDeviceArray{Float32, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Base.Broadcast.Extruded{Array{Float32, 4}, NTuple{4, Bool}, NTuple{4, Int64}}}}, Int64}}}})
@ GPUCompiler C:\Users\jackn\.julia\packages\GPUCompiler\iaKrd\src\driver.jl:74
[9] cufunction_compile(job::GPUCompiler.CompilerJob)
@ CUDA C:\Users\jackn\.julia\packages\CUDA\tTK8Y\src\compiler\execution.jl:346
[10] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
@ GPUCompiler C:\Users\jackn\.julia\packages\GPUCompiler\iaKrd\src\cache.jl:90
[11] cufunction(f::GPUArrays.var"#broadcast_kernel#15", tt::Type{Tuple{CUDA.CuKernelContext, CUDA.CuDeviceArray{Float32, 4, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4}, NTuple{4, Base.OneTo{Int64}}, typeof(+), Tuple{Base.Broadcast.Extruded{CUDA.CuDeviceArray{Float32, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Base.Broadcast.Extruded{Array{Float32, 4}, NTuple{4, Bool}, NTuple{4, Int64}}}}, Int64}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
@ CUDA C:\Users\jackn\.julia\packages\CUDA\tTK8Y\src\compiler\execution.jl:299
[12] cufunction
@ C:\Users\jackn\.julia\packages\CUDA\tTK8Y\src\compiler\execution.jl:293 [inlined]
[13] macro expansion
@ C:\Users\jackn\.julia\packages\CUDA\tTK8Y\src\compiler\execution.jl:102 [inlined]
[14] #launch_heuristic#248
@ C:\Users\jackn\.julia\packages\CUDA\tTK8Y\src\gpuarrays.jl:17 [inlined]
[15] _copyto!
@ C:\Users\jackn\.julia\packages\GPUArrays\EVTem\src\host\broadcast.jl:73 [inlined]
[16] copyto!
@ C:\Users\jackn\.julia\packages\GPUArrays\EVTem\src\host\broadcast.jl:56 [inlined]
[17] copy
@ C:\Users\jackn\.julia\packages\GPUArrays\EVTem\src\host\broadcast.jl:47 [inlined]
[18] materialize
@ .\broadcast.jl:860 [inlined]
[19] apply_mask(score::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, mask::Array{Float32, 3})
@ Transformers.Basic C:\Users\jackn\.julia\packages\Transformers\K1F88\src\basic\mh_atten.jl:182
[20] apply_mask
@ C:\Users\jackn\.julia\packages\Transformers\K1F88\src\basic\mh_atten.jl:188 [inlined]
[21] attention(query::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, key::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, value::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, mask::Array{Float32, 3}, future::Bool, dropout::Dropout{Float64, Colon, CUDA.RNG})
@ Transformers.Basic C:\Users\jackn\.julia\packages\Transformers\K1F88\src\basic\mh_atten.jl:204
[22] (::Transformers.Basic.MultiheadAttention{Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dropout{Float64, Colon, CUDA.RNG}})(query::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, key::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, value::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}; mask::Array{Float32, 3})
@ Transformers.Basic C:\Users\jackn\.julia\packages\Transformers\K1F88\src\basic\mh_atten.jl:102
[23] (::Transformer{Transformers.Basic.MultiheadAttention{Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dropout{Float64, Colon, CUDA.RNG}}, LayerNorm{typeof(identity), Flux.Scale{typeof(identity), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Float32, 1}, Transformers.Basic.PwFFN{Dense{typeof(gelu), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, LayerNorm{typeof(identity), Flux.Scale{typeof(identity), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Float32, 1}, Dropout{Float64, Colon, CUDA.RNG}})(x::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, mask::Array{Float32, 3})
@ Transformers.Basic C:\Users\jackn\.julia\packages\Transformers\K1F88\src\basic\transformer.jl:69
[24] macro expansion
@ C:\Users\jackn\.julia\packages\Transformers\K1F88\src\stacks\stack.jl:0 [inlined]
[25] (::Stack{Symbol("((x, m) => x':(x, m)) => 12"), NTuple{12, Transformer{Transformers.Basic.MultiheadAttention{Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dropout{Float64, Colon, CUDA.RNG}}, LayerNorm{typeof(identity), Flux.Scale{typeof(identity), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Float32, 1}, Transformers.Basic.PwFFN{Dense{typeof(gelu), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, LayerNorm{typeof(identity), Flux.Scale{typeof(identity), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Float32, 1}, Dropout{Float64, Colon, CUDA.RNG}}}})(::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, ::Array{Float32, 3})
@ Transformers.Stacks C:\Users\jackn\.julia\packages\Transformers\K1F88\src\stacks\stack.jl:19
[26] (::Bert{Stack{Symbol("((x, m) => x':(x, m)) => 12"), NTuple{12, Transformer{Transformers.Basic.MultiheadAttention{Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dropout{Float64, Colon, CUDA.RNG}}, LayerNorm{typeof(identity), Flux.Scale{typeof(identity), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Float32, 1}, Transformers.Basic.PwFFN{Dense{typeof(gelu), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, LayerNorm{typeof(identity), Flux.Scale{typeof(identity), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Float32, 1}, Dropout{Float64, Colon, CUDA.RNG}}}}, Dropout{Float64, Colon, CUDA.RNG}})(x::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, mask::Array{Float32, 3}, all::Bool)
@ Transformers.BidirectionalEncoder C:\Users\jackn\.julia\packages\Transformers\K1F88\src\bert\bert.jl:55
[27] Bert
@ C:\Users\jackn\.julia\packages\Transformers\K1F88\src\bert\bert.jl:50 [inlined]
[28] loss(data::NamedTuple{(:tok, :segment), Tuple{Matrix{Int64}, Matrix{Int64}}}, ind::Vector{Tuple{Int64, Int64}}, masklabel::Flux.OneHotArray{UInt32, 30522, 1, 2, Vector{UInt32}}, nextlabel::Flux.OneHotArray{UInt32, 2, 1, 2, Vector{UInt32}}, mask::Array{Float32, 3})
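The KernelError points at the broadcast inside apply_mask (mh_atten.jl:182): the attention scores are a CuArray on the GPU, but the mask that reaches them is still a plain Array{Float32, 3} on the CPU, and a CPU Array is not an isbits value that can be passed into a CUDA kernel. The usual remedy is to move the mask (and the rest of the batch) onto the GPU before calling the model. Below is a minimal sketch of that fix, assuming the mask is built with getmask as in the Transformers.jl BERT pretraining example; `tokens` is a hypothetical placeholder, not a name taken from the failing script.

```julia
using Transformers
using Transformers.Basic   # getmask
using Flux

enable_gpu(true)                   # make todevice move data onto the GPU

mask = getmask(collect.(tokens))   # Array{Float32, 3} built on the CPU
mask = todevice(mask)              # now a CuArray, on the same device as the model

# The token/segment batch fed to loss should be moved the same way, e.g.
# data = todevice(data), before the forward pass.
```

Flux's gpu(mask) works equally well here; the essential point is only that the mask must live on the same device as the model's parameters before it hits the `score .+ mask` broadcast in apply_mask.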