transformers.jl's Introduction

Transformers.jl


Julia implementation of transformer-based models, with Flux.jl.

Notice: the current version is almost completely different from the 0.1.x versions. If you are upgrading from an old version, make sure to update your code accordingly, or stick to the old version.

Installation

In the Julia REPL:

]add Transformers
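
Or equivalently, using the Pkg API:

using Pkg
Pkg.add("Transformers")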

Example

Using a pretrained BERT model with Transformers.jl.

using Transformers
using Transformers.TextEncoders
using Transformers.HuggingFace

textencoder, bert_model = hgf"bert-base-uncased"

text1 = "Peter Piper picked a peck of pickled peppers"
text2 = "Fuzzy Wuzzy was a bear"

text = [[ text1, text2 ]] # 1 batch of contiguous sentences
sample = encode(textencoder, text) # tokenize + pre-process (add special tokens + truncate / padding + one-hot encode)

@assert reshape(decode(textencoder, sample.token), :) == [
    "[CLS]", "peter", "piper", "picked", "a", "peck", "of", "pick", "##led", "peppers", "[SEP]",
    "fuzzy", "wu", "##zzy",  "was", "a", "bear", "[SEP]"
]

bert_features = bert_model(sample).hidden_state
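
As a quick sanity check, the shape of the returned features can be inspected. Assuming the usual feature-first layout (hidden_size × sequence_length × batch), this batch should give 768 × 18 × 1:

@show size(bert_features) # expected (768, 18, 1) for bert-base-uncased on this batch, if the layout assumption holds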

See the example folder for the complete example.

For more information

If you want to know more about this package, see the documentation and read the code in the example folder. You can also tag me (@chengchingwen) on Julia's Slack or Discourse if you have any questions, or just open a new issue on GitHub.

transformers.jl's People

Contributors

adarshkumar712, alexander-barth, chengchingwen, chrisrackauckas, fredrikekre, github-actions[bot], jondeuce, juliatagbot, jwkvam, lilithhafner, maj0e, nmoran, oblynx, originalsouth, tylerjthomas9, viralbshah, ymtoo


transformers.jl's Issues

ERROR: downloading on M1, Julia 1.7 beta 3

I keep getting this error:
[ Info: loading pretrain bert model: scibert_basevocab_uncased.tfbson
ERROR: type DataType has no field mutable

no matter what model type I pick.

Flux.onehot freezes when used with Vocabulary, or throws StackOverflow

Here is the stack trace:

(@v1.6) pkg> status
      Status `~/.julia/environments/v1.6/Project.toml`
  [69666777] Arrow v1.5.0
  [c52e3926] Atom v0.12.32
  [fbb218c0] BSON v0.3.3
  [336ed68f] CSV v0.8.5
  [052768ef] CUDA v3.2.1
  [a93c6f00] DataFrames v1.1.1
  [587475ba] Flux v0.12.4
  [e5e0dc1b] Juno v0.8.4
  [6f286f6a] MultivariateStats v0.8.0
  [91a5bcdd] Plots v1.15.2
  [2913bbd2] StatsBase v0.33.8
  [21ca0261] Transformers v0.1.12
  [e88e6eb3] Zygote v0.6.11
  [9a3f8284] Random
julia> Flux.onehot(vocab,[1])
ERROR: StackOverflowError:
Stacktrace:
  [1] findnext(testf::Base.Fix2{typeof(isequal), Int64}, A::Vector{Any}, start::Int64)
    @ Base ./array.jl:1855
  [2] findfirst
    @ ./array.jl:1906 [inlined]
  [3] encode(vocab::Vocabulary{Any}, i::Int64)
    @ Transformers.Basic ~/.julia/packages/Transformers/rCnGb/src/basic/embeds/vocab.jl:32
  [4] (::Transformers.Basic.var"#13#14"{Vocabulary{Any}})(x::Int64)
    @ Transformers.Basic ~/.julia/packages/Transformers/rCnGb/src/basic/embeds/vocab.jl:34
  [5] iterate
    @ ./generator.jl:47 [inlined]
  [6] _collect(c::Vector{Int64}, itr::Base.Generator{Vector{Int64}, Transformers.Basic.var"#13#14"{Vocabulary{Any}}}, #unused#::Base.EltypeUnknown, isz::Base.HasShape{1})
    @ Base ./array.jl:691
  [7] collect_similar
    @ ./array.jl:606 [inlined]
  [8] map
    @ ./abstractarray.jl:2294 [inlined]
  [9] encode(vocab::Vocabulary{Any}, xs::Vector{Int64})
    @ Transformers.Basic ~/.julia/packages/Transformers/rCnGb/src/basic/embeds/vocab.jl:34
 [10] (::Vocabulary{Any})(x::Vector{Int64})
    @ Transformers.Basic ~/.julia/packages/Transformers/rCnGb/src/basic/embeds/vocab.jl:52
 [11] onehot(v::Vocabulary{Any}, x::Vector{Int64}) (repeats 79973 times)
    @ Transformers.Basic ~/.julia/packages/Transformers/rCnGb/src/basic/embeds/vocab.jl:83

Type unstable functions

The forward step of Transformer is type unstable. Running the example from the docs

using Transformers

m = Transformer(512, 8, 64, 2048) # define a Transformer block with 8 heads and 64 neurons per head
x = randn(512, 30, 3) # fake data of length 30

y = m(x)

and checking with @code_warntype produces:

julia> @code_warntype m(x,nothing)
Variables
  t::Transformer
  x::Array{Float64,3}
  mask::Core.Compiler.Const(nothing, false)
  a::Any
  insize::Any
  res_a::Any
  pwffn::AbstractArray{T,2} where T
  res_pwffn::Any

Body::Any
1 ─       Core.NewvarNode(:(insize))
│         Core.NewvarNode(:(pwffn))
│         Core.NewvarNode(:(res_pwffn))
│   %4  = (:mask,)::Core.Compiler.Const((:mask,), false)
│   %5  = Core.apply_type(Core.NamedTuple, %4)::Core.Compiler.Const(NamedTuple{(:mask,),T} where T<:Tuple, false)
│   %6  = Core.tuple(mask)::Core.Compiler.Const((nothing,), false)
│   %7  = (%5)(%6)::Core.Compiler.Const((mask = nothing,), false)
│   %8  = Base.getproperty(t, :mh)::Transformers.Basic.MultiheadAttention
│   %9  = Core.kwfunc(%8)::Core.Compiler.Const(Core.var"#Any##kw"(), false)
│   %10 = Base.getproperty(t, :mh)::Transformers.Basic.MultiheadAttention
│         (a = (%9)(%7, %10, x, x, x))
│   %12 = Base.getproperty(t, :drop)::Flux.Dropout
│         (a = (%12)(a))
│   %14 = Base.broadcasted(Transformers.Basic.:+, x, a)::Any
│         (res_a = Base.materialize(%14))
│   %16 = ($(Expr(:static_parameter, 2)) == 3)::Core.Compiler.Const(true, false)
└──       goto #3 if not %16
2 ─       (insize = Transformers.Basic.size(res_a))
│   %19 = res_a::Any
│   %20 = Base.getindex(insize, 1)::Any
└──       (res_a = Transformers.Basic.reshape(%19, %20, Transformers.Basic.:(:)))
3 ┄ %22 = Base.getproperty(t, :mhn)::Flux.LayerNorm
│         (res_a = (%22)(res_a))
│   %24 = Base.getproperty(t, :pw)::Transformers.Basic.PwFFN
│         (pwffn = (%24)(res_a))
│   %26 = Base.getproperty(t, :drop)::Flux.Dropout
│         (pwffn = (%26)(pwffn))
│   %28 = Base.broadcasted(Transformers.Basic.:+, res_a, pwffn)::Any
│         (res_pwffn = Base.materialize(%28))
│   %30 = Base.getproperty(t, :pwn)::Flux.LayerNorm
│         (res_pwffn = (%30)(res_pwffn))
│   %32 = ($(Expr(:static_parameter, 2)) == 3)::Core.Compiler.Const(true, false)
└──       goto #5 if not %32
4 ─ %34 = Core.tuple(res_pwffn, Transformers.Basic.:(:))::Core.Compiler.PartialStruct(Tuple{Any,Colon}, Any[Any, Core.Compiler.Const(Colon(), false)])
│   %35 = Base.tail::Core.Compiler.Const(Base.tail, false)
│   %36 = (%35)(insize)::Union{Tuple, NamedTuple}
└──       (res_pwffn = Core._apply_iterate(Base.iterate, Transformers.Basic.reshape, %34, %36))
5 ┄       return res_pwffn

The source of the instability is probably the multihead attention, but I have not been able to distill it any further.
I am using the latest tagged version 0.1.3 of Transformers on Julia 1.4.1.
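
One way to narrow this down might be to run @code_warntype on the attention sub-layer alone. A minimal sketch, assuming the field name mh and the three-tensor call signature read off the @code_warntype output above:

using Transformers
using InteractiveUtils # provides @code_warntype

m = Transformer(512, 8, 64, 2048)
x = randn(512, 30, 3)

mha = m.mh                  # the MultiheadAttention sub-layer
@code_warntype mha(x, x, x) # check whether the instability originates here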

BERT pretrain example not working

I tried running the BERT example found here https://github.com/chengchingwen/Transformers.jl/blob/master/example/BERT/_pretrain/pretrain.jl but I got the following error on the train!() line at the bottom of the code.

[ Info: loading pretrain bert model: uncased_L-12_H-768_A-12.tfbson wordpiece
[ Info: loading pretrain bert model: uncased_L-12_H-768_A-12.tfbson tokenizer
[ Info: loading pretrain bert model: uncased_L-12_H-768_A-12.tfbson bert_model
[ Info: start training
[ Info: epoch: 1
ERROR: LoadError: GPU compilation of kernel #broadcast_kernel#15(CUDA.CuKernelContext, CUDA.CuDeviceArray{Float32, 4, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4}, NTuple{4, Base.OneTo{Int64}}, typeof(+), Tuple{Base.Broadcast.Extruded{CUDA.CuDeviceArray{Float32, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Base.Broadcast.Extruded{Array{Float32, 4}, NTuple{4, Bool}, NTuple{4, Int64}}}}, Int64) failed
KernelError: passing and using non-bitstype argument

Argument 4 to your kernel function is of type Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4}, NTuple{4, Base.OneTo{Int64}}, typeof(+), Tuple{Base.Broadcast.Extruded{CUDA.CuDeviceArray{Float32, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Base.Broadcast.Extruded{Array{Float32, 4}, NTuple{4, Bool}, NTuple{4, Int64}}}}, which is not isbits:
  .args is of type Tuple{Base.Broadcast.Extruded{CUDA.CuDeviceArray{Float32, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Base.Broadcast.Extruded{Array{Float32, 4}, NTuple{4, Bool}, NTuple{4, Int64}}} which is not isbits.
    .2 is of type Base.Broadcast.Extruded{Array{Float32, 4}, NTuple{4, Bool}, NTuple{4, Int64}} which is not isbits.
      .x is of type Array{Float32, 4} which is not isbits.


Stacktrace:
  [1] check_invocation(job::GPUCompiler.CompilerJob)
    @ GPUCompiler C:\Users\jackn\.julia\packages\GPUCompiler\iaKrd\src\validation.jl:86
  [2] macro expansion
    @ C:\Users\jackn\.julia\packages\GPUCompiler\iaKrd\src\driver.jl:413 [inlined]
  [3] macro expansion
    @ C:\Users\jackn\.julia\packages\TimerOutputs\jgSVI\src\TimerOutput.jl:252 [inlined]
  [4] macro expansion
    @ C:\Users\jackn\.julia\packages\GPUCompiler\iaKrd\src\driver.jl:412 [inlined]
  [5] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
    @ GPUCompiler C:\Users\jackn\.julia\packages\GPUCompiler\iaKrd\src\utils.jl:64
  [6] cufunction_compile(job::GPUCompiler.CompilerJob, ctx::LLVM.Context)
    @ CUDA C:\Users\jackn\.julia\packages\CUDA\tTK8Y\src\compiler\execution.jl:354
  [7] #224
    @ C:\Users\jackn\.julia\packages\CUDA\tTK8Y\src\compiler\execution.jl:347 [inlined]
  [8] JuliaContext(f::CUDA.var"#224#225"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams, GPUCompiler.FunctionSpec{GPUArrays.var"#broadcast_kernel#15", Tuple{CUDA.CuKernelContext, CUDA.CuDeviceArray{Float32, 4, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4}, NTuple{4, Base.OneTo{Int64}}, typeof(+), Tuple{Base.Broadcast.Extruded{CUDA.CuDeviceArray{Float32, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Base.Broadcast.Extruded{Array{Float32, 4}, NTuple{4, Bool}, NTuple{4, Int64}}}}, Int64}}}})
    @ GPUCompiler C:\Users\jackn\.julia\packages\GPUCompiler\iaKrd\src\driver.jl:74
  [9] cufunction_compile(job::GPUCompiler.CompilerJob)
    @ CUDA C:\Users\jackn\.julia\packages\CUDA\tTK8Y\src\compiler\execution.jl:346
 [10] cached_compilation(cache::Dict{UInt64, Any}, job::GPUCompiler.CompilerJob, compiler::typeof(CUDA.cufunction_compile), linker::typeof(CUDA.cufunction_link))
    @ GPUCompiler C:\Users\jackn\.julia\packages\GPUCompiler\iaKrd\src\cache.jl:90
 [11] cufunction(f::GPUArrays.var"#broadcast_kernel#15", tt::Type{Tuple{CUDA.CuKernelContext, CUDA.CuDeviceArray{Float32, 4, 1}, Base.Broadcast.Broadcasted{CUDA.CuArrayStyle{4}, NTuple{4, Base.OneTo{Int64}}, typeof(+), Tuple{Base.Broadcast.Extruded{CUDA.CuDeviceArray{Float32, 4, 1}, NTuple{4, Bool}, NTuple{4, Int64}}, Base.Broadcast.Extruded{Array{Float32, 4}, NTuple{4, Bool}, NTuple{4, Int64}}}}, Int64}}; name::Nothing, kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ CUDA C:\Users\jackn\.julia\packages\CUDA\tTK8Y\src\compiler\execution.jl:299
 [12] cufunction
    @ C:\Users\jackn\.julia\packages\CUDA\tTK8Y\src\compiler\execution.jl:293 [inlined]
 [13] macro expansion
    @ C:\Users\jackn\.julia\packages\CUDA\tTK8Y\src\compiler\execution.jl:102 [inlined]
 [14] #launch_heuristic#248
    @ C:\Users\jackn\.julia\packages\CUDA\tTK8Y\src\gpuarrays.jl:17 [inlined]
 [15] _copyto!
    @ C:\Users\jackn\.julia\packages\GPUArrays\EVTem\src\host\broadcast.jl:73 [inlined]
 [16] copyto!
    @ C:\Users\jackn\.julia\packages\GPUArrays\EVTem\src\host\broadcast.jl:56 [inlined]
 [17] copy
    @ C:\Users\jackn\.julia\packages\GPUArrays\EVTem\src\host\broadcast.jl:47 [inlined]
 [18] materialize
    @ .\broadcast.jl:860 [inlined]
 [19] apply_mask(score::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, mask::Array{Float32, 3})
    @ Transformers.Basic C:\Users\jackn\.julia\packages\Transformers\K1F88\src\basic\mh_atten.jl:182
 [20] apply_mask
    @ C:\Users\jackn\.julia\packages\Transformers\K1F88\src\basic\mh_atten.jl:188 [inlined]
 [21] attention(query::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, key::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, value::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, mask::Array{Float32, 3}, future::Bool, dropout::Dropout{Float64, Colon, CUDA.RNG})
    @ Transformers.Basic C:\Users\jackn\.julia\packages\Transformers\K1F88\src\basic\mh_atten.jl:204
 [22] (::Transformers.Basic.MultiheadAttention{Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dropout{Float64, Colon, CUDA.RNG}})(query::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, key::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, value::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}; mask::Array{Float32, 3})
    @ Transformers.Basic C:\Users\jackn\.julia\packages\Transformers\K1F88\src\basic\mh_atten.jl:102
 [23] (::Transformer{Transformers.Basic.MultiheadAttention{Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dropout{Float64, Colon, CUDA.RNG}}, LayerNorm{typeof(identity), Flux.Scale{typeof(identity), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Float32, 1}, Transformers.Basic.PwFFN{Dense{typeof(gelu), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, LayerNorm{typeof(identity), Flux.Scale{typeof(identity), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Float32, 1}, Dropout{Float64, Colon, CUDA.RNG}})(x::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, mask::Array{Float32, 3})
    @ Transformers.Basic C:\Users\jackn\.julia\packages\Transformers\K1F88\src\basic\transformer.jl:69
 [24] macro expansion
    @ C:\Users\jackn\.julia\packages\Transformers\K1F88\src\stacks\stack.jl:0 [inlined]
 [25] (::Stack{Symbol("((x, m) => x':(x, m)) => 12"), NTuple{12, Transformer{Transformers.Basic.MultiheadAttention{Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dropout{Float64, Colon, CUDA.RNG}}, LayerNorm{typeof(identity), Flux.Scale{typeof(identity), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Float32, 1}, Transformers.Basic.PwFFN{Dense{typeof(gelu), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, LayerNorm{typeof(identity), Flux.Scale{typeof(identity), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Float32, 1}, Dropout{Float64, Colon, CUDA.RNG}}}})(::CUDA.CuArray{Float32, 3, 
CUDA.Mem.DeviceBuffer}, ::Array{Float32, 3})
    @ Transformers.Stacks C:\Users\jackn\.julia\packages\Transformers\K1F88\src\stacks\stack.jl:19
 [26] (::Bert{Stack{Symbol("((x, m) => x':(x, m)) => 12"), NTuple{12, Transformer{Transformers.Basic.MultiheadAttention{Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, 
CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 
1, CUDA.Mem.DeviceBuffer}}, Dropout{Float64, Colon, CUDA.RNG}}, LayerNorm{typeof(identity), Flux.Scale{typeof(identity), CUDA.CuArray{Float32, 1, 
CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Float32, 1}, Transformers.Basic.PwFFN{Dense{typeof(gelu), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Dense{typeof(identity), CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, LayerNorm{typeof(identity), Flux.Scale{typeof(identity), CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, CUDA.CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, Float32, 1}, Dropout{Float64, Colon, CUDA.RNG}}}}, Dropout{Float64, Colon, CUDA.RNG}})(x::CUDA.CuArray{Float32, 3, CUDA.Mem.DeviceBuffer}, mask::Array{Float32, 3}; all::Bool)
    @ Transformers.BidirectionalEncoder C:\Users\jackn\.julia\packages\Transformers\K1F88\src\bert\bert.jl:55
 [27] Bert
    @ C:\Users\jackn\.julia\packages\Transformers\K1F88\src\bert\bert.jl:50 [inlined]
 [28] loss(data::NamedTuple{(:tok, :segment), Tuple{Matrix{Int64}, Matrix{Int64}}}, ind::Vector{Tuple{Int64, Int64}}, masklabel::Flux.OneHotArray{UInt32, 30522, 1, 2, Vector{UInt32}}, nextlabel::Flux.OneHotArray{UInt32, 2, 1, 2, Vector{UInt32}}, mask::Array{Float32, 3})
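
Judging from the trace, the attention mask reaching apply_mask is still a plain Array{Float32, 3} while the scores are a CuArray, which is what the kernel compiler rejects. A hedged sketch of a possible workaround, assuming the mask is built on the CPU in the data/loss pipeline:

# move the mask to the GPU along with the rest of the batch, so the
# broadcast in apply_mask only sees CuArrays
mask = todevice(mask) # with enable_gpu(true); CUDA.cu(mask) should also work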

Thanks. Not really an issue, but a question.

I ran across this package using the new Julia Repository Search engine, but only after I had built my own (posted on Julia Discourse and GitHub). I'm having some trouble getting mine to converge on anything but the "copy" task. Yours does converge on my "stutter" task, which is a derivative of copy with select tokens duplicated. I've been using your code and can't find anything that explains the difference. However, there is one thing I did not understand: is it necessary (and why) to create a custom gradient for the word embedding? I simply indexed a TrackedArray. Is that a problem?
Thanks for the Transformer, though. It seems very well done.

Error in test

On Julia master and Transformers master I get an error when running the tests:

[ Info: Test EMBED
Gather: Error During Test at /home/gkraemer/.julia/dev/Transformers/test/embed/gather.jl:1
  Got exception outside of a @test
  MethodError: no method matching Core.Compiler.IRCode(::Vector{Any}, ::Vector{Any}, ::Vector{Int32}, ::Vector{UInt8}, ::Core.Compiler.CFG, ::Vector{Core.LineInfoNode}, ::Vector{Any}, ::Vector{Any}, ::Vector{Any})
  Stacktrace:
    [1] Core.Compiler.IRCode(ir::IRTools.Inner.IR)
      @ IRTools.Inner.Wrap ~/.julia/packages/IRTools/GVPoj/src/ir/wrap.jl:55
    [2] update!(ci::Core.CodeInfo, ir::IRTools.Inner.IR)
      @ IRTools.Inner ~/.julia/packages/IRTools/GVPoj/src/reflection/utils.jl:143
    [3] #s2812#1248
      @ ~/.julia/packages/Zygote/jQK65/src/compiler/interface2.jl:34 [inlined]
    [4] var"#s2812#1248"(::Any, ctx::Any, f::Any, args::Any)
      @ Zygote ./none:0
    [5] (::Core.GeneratedFunctionStub)(::Any, ::Vararg{Any, N} where N)
      @ Core ./boot.jl:571
    [6] _pullback(f::Function, args::Matrix{Float64})
      @ Zygote ~/.julia/packages/Zygote/jQK65/src/compiler/interface.jl:38
    [7] pullback(f::Function, args::Matrix{Float64})
      @ Zygote ~/.julia/packages/Zygote/jQK65/src/compiler/interface.jl:44
    [8] gradient(f::Function, args::Matrix{Float64})
      @ Zygote ~/.julia/packages/Zygote/jQK65/src/compiler/interface.jl:53
    [9] macro expansion
      @ ~/.julia/dev/Transformers/test/embed/gather.jl:25 [inlined]
   [10] macro expansion
      @ ~/progs/julia/julia-master/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1144 [inlined]
   [11] top-level scope
      @ ~/.julia/dev/Transformers/test/embed/gather.jl:2
   [12] include(fname::String)
      @ Base.MainInclude ./client.jl:444
   [13] macro expansion
      @ ~/.julia/dev/Transformers/test/test_embed.jl:3 [inlined]
   [14] macro expansion
      @ ~/progs/julia/julia-master/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1144 [inlined]
   [15] top-level scope
      @ ~/.julia/dev/Transformers/test/test_embed.jl:2
   [16] include(fname::String)
      @ Base.MainInclude ./client.jl:444
   [17] macro expansion
      @ ~/.julia/dev/Transformers/test/runtests.jl:39 [inlined]
   [18] macro expansion
      @ ~/progs/julia/julia-master/usr/share/julia/stdlib/v1.6/Test/src/Test.jl:1144 [inlined]
   [19] top-level scope
      @ ~/.julia/dev/Transformers/test/runtests.jl:29
   [20] include(fname::String)
      @ Base.MainInclude ./client.jl:444
   [21] top-level scope
      @ none:6
   [22] eval(m::Module, e::Any)
      @ Core ./boot.jl:360
   [23] exec_options(opts::Base.JLOptions)
      @ Base ./client.jl:261
   [24] _start()
      @ Base ./client.jl:485
┌ Warning: concat OneHot{10} along dimension 1.
└ @ Transformers.Basic ~/.julia/dev/Transformers/src/basic/embeds/onehot.jl:156
┌ Warning: concat OneHot{10} along dimension 1.
└ @ Transformers.Basic ~/.julia/dev/Transformers/src/basic/embeds/onehot.jl:156
┌ Warning: concat OneHotArray{10} along dimension 1.
└ @ Transformers.Basic ~/.julia/dev/Transformers/src/basic/embeds/onehot.jl:173
┌ Warning: concat OneHotArray{10} along dimension 1.
└ @ Transformers.Basic ~/.julia/dev/Transformers/src/basic/embeds/onehot.jl:173

I have tried updating Zygote.jl to the latest master without success.

parse args fails

Also, this line no longer works.

With the following environment:

(transformersApproach) pkg> st
      Status `~/transformersApproach/Project.toml`
  [c7e460c6] ArgParse v1.1.4
  [052768ef] CUDA v3.3.4
  [587475ba] Flux v0.12.5
  [21ca0261] Transformers v0.1.13
  [796a5d58] WordTokenizers v0.5.6
  [e88e6eb3] Zygote v0.6.17

Update docs and examples for onehot/vocabulary with Int input

I'm testing some code from the docs, in this case the copy task.

https://chengchingwen.github.io/Transformers.jl/dev/tutorial/#define-the-loss-and-training-loop

I notice that things go wrong in the loss function definition, due to the onehot method.

Running that code gives this error:

ERROR: LoadError: MethodError: no method matching OneHotArray(::Transformers.Basic.OneHot{0x0000000d})
Closest candidates are:
  OneHotArray(::Any, ::Transformers.Basic.OneHot) 

This is the environment I'm using:

(transformersApproach) pkg> st
      Status `~/transformersApproach/Project.toml`
  [052768ef] CUDA v3.3.4
  [587475ba] Flux v0.12.5
  [21ca0261] Transformers v0.1.13
  [e88e6eb3] Zygote v0.6.17

Is there any quick workaround for this issue?

Evaluation into the closed module `Transformers` breaks incremental compilation (enable_gpu)

module Outer

using Transformers
enable_gpu()

end

In the REPL:

using Outer

[ Info: Precompiling Outer [7d29fca4-1b1c-4df5-92e9-f71c99fd1fc4]
ERROR: LoadError: Evaluation into the closed module `Transformers` breaks incremental compilation because the side effects will not be permanent. This is likely due to some other module mutating `Transformers` with `eval` during precompilation - don't do this.
Stacktrace:
  [1] eval
    @ .\boot.jl:373 [inlined]
  [2] enable_gpu(t::Bool)
    @ Transformers C:\Users\Admin\.julia\packages\Transformers\dAmXK\src\Transformers.jl:45
  [3] enable_gpu()
    @ Transformers C:\Users\Admin\.julia\packages\Transformers\dAmXK\src\Transformers.jl:43
  [4] top-level scope
    @ C:\Users\Admin\Desktop\work\tweet-classification\Outer\src\Outer.jl:4
  [5] include
    @ .\Base.jl:418 [inlined]
  [6] include_package_for_output(pkg::Base.PkgId, input::String, depot_path::Vector{String}, dl_load_path::Vector{String}, load_path::Vector{String}, concrete_deps::Vector{Pair{Base.PkgId, UInt64}}, source::Nothing)
    @ Base .\loading.jl:1318
  [7] top-level scope
    @ none:1
  [8] eval
    @ .\boot.jl:373 [inlined]
  [9] eval(x::Expr)
    @ Base.MainInclude .\client.jl:453
 [10] top-level scope
    @ none:1
in expression starting at C:\Users\Admin\Desktop\work\tweet-classification\Outer\src\Outer.jl:1
ERROR: Failed to precompile Outer [7d29fca4-1b1c-4df5-92e9-f71c99fd1fc4] to C:\Users\Admin\.julia\compiled\v1.7\Outer\jl_BFE0.tmp.
Stacktrace:
 [1] error(s::String)
   @ Base .\error.jl:33
 [2] compilecache(pkg::Base.PkgId, path::String, internal_stderr::IO, internal_stdout::IO, ignore_loaded_modules::Bool)
   @ Base .\loading.jl:1466
 [3] compilecache(pkg::Base.PkgId, path::String)
   @ Base .\loading.jl:1410
 [4] _require(pkg::Base.PkgId)
   @ Base .\loading.jl:1120
 [5] require(uuidkey::Base.PkgId)
   @ Base .\loading.jl:1013
 [6] require(into::Module, mod::Symbol)
   @ Base .\loading.jl:997

The culprit is the @eval inside Transformers.jl:

function enable_gpu(t::Bool=true)
    if t
        CUDA.functional() || error("CUDA not functional")
        @eval todevice(args...) = togpudevice(args...)
    else
        @eval todevice(args...) = tocpudevice(args...)
    end
end
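
A possible workaround, assuming GPU mode only needs to be switched on when the package is loaded rather than at precompile time, is to defer the call to the module's __init__; a minimal sketch:

module Outer

using Transformers

function __init__()
    # runs at load time, after precompilation, so the @eval inside
    # enable_gpu no longer happens while Outer is being precompiled
    enable_gpu()
end

end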

No need to reshape?

It appears that, to pass input to the LayerNorm, the tensor is reshaped into a 2D matrix (feature_size × (sequence_length × batch_size)) and then reshaped back after all the norm layers are done operating. I think this happens in multiple places (e.g. the @toNd macro).

function (t::Transformer)(x::AbstractArray{T, N}, mask=nothing) where {T, N}

Based on a recent Zulip topic, I think this isn't required due to Julia's broadcasting machinery.

julia> x = rand(3, 2, 10)

julia> d = Flux.Diagonal(3)
Diagonal(3)

julia> d(Flux.normalise(x; dims = 1)) == reshape(d(Flux.normalise(reshape(x, 3, :); dims = 1)), 3, 2, 10)
true

As seen above, the LayerNorm body applied to the 3D tensor and to the reshaped tensor produces the same output.

possible issues in PositionEmbedding ?

Thank you very much for making your work available as a Julia package!
It helps me a lot in learning about Transformers (a topic I don't know much about...).
I was comparing:

function PE(size, pos, i::Int)
    if rem(i, 2) == 0
        sin(pos/1e4^(i/size))
    else
        cos(pos/1e4^((i-1)/size))
    end
end

to
https://machinelearningmastery.com/a-gentle-introduction-to-positional-encoding-in-transformer-models-part-1/

It seems to me that we have cos(pos) (for i = 1) in the embedding but we do not have its pair sin(pos), since the index i is 1-based. If we shifted i by 1, we would have all the sin/cos pairs (see the sketch below).

Of course, this is a very new topic to me and I might be completely wrong here (or it simply does not matter...).
I can make a PR if this is indeed an issue.
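
For reference, a hedged sketch of the shifted version suggested above (the helper name PE0 is hypothetical; it only moves the 1-based Julia index back to the 0-based convention used in the linked article):

# same formula, but with the index shifted so that i = 1 gives sin(pos),
# i = 2 gives cos(pos), and so on, forming complete sin/cos pairs
function PE0(size, pos, i::Int)
    k = i - 1 # 0-based index
    if iseven(k)
        sin(pos / 1e4^(k / size))
    else
        cos(pos / 1e4^((k - 1) / size))
    end
end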

SystemError: Permission Denied (hgf)

bert_model = hgf"bert-base-cased:forsequenceclassification";

ERROR: SystemError: opening file "C:\\Users\\Admin\\.julia\\packages\\Transformers\\JJhzu\\src\\huggingface\\Artifacts.toml": Permission denied
Stacktrace:
  [1] systemerror(p::String, errno::Int32; extrainfo::Nothing)
    @ Base .\error.jl:174
  [2] #systemerror#68
    @ .\error.jl:173 [inlined]
  [3] systemerror
    @ .\error.jl:173 [inlined]
  [4] open(fname::String; lock::Bool, read::Nothing, write::Nothing, create::Nothing, truncate::Bool, append::Nothing)
    @ Base .\iostream.jl:293
  [5] open(fname::String, mode::String; lock::Bool)
    @ Base .\iostream.jl:355
  [6] open(fname::String, mode::String)
    @ Base .\iostream.jl:355
  [7] open(::Pkg.Artifacts.var"#9#13"{Dict{String, Any}}, ::String, ::Vararg{String}; kwargs::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
    @ Base .\io.jl:328
  [8] open
    @ .\io.jl:328 [inlined]
  [9] bind_artifact!(artifacts_toml::String, name::String, hash::Base.SHA1; platform::Nothing, download_info::Nothing, lazy::Bool, force::Bool)
    @ Pkg.Artifacts C:\Users\Admin\AppData\Local\Programs\Julia-1.7.3\share\julia\stdlib\v1.7\Pkg\src\Artifacts.jl:244
 [10] bind_artifact!
    @ C:\Users\Admin\AppData\Local\Programs\Julia-1.7.3\share\julia\stdlib\v1.7\Pkg\src\Artifacts.jl:187 [inlined]
 [11] find_or_register_hgf_file_hash(path::String, model_name::SubString{String}, file_name::String)
    @ Transformers.HuggingFace C:\Users\Admin\.julia\packages\Transformers\JJhzu\src\huggingface\download.jl:144
 [12] get_or_download_hgf_file
    @ C:\Users\Admin\.julia\packages\Transformers\JJhzu\src\huggingface\download.jl:86 [inlined]
 [13] #get_or_download_hgf_config#3
    @ C:\Users\Admin\.julia\packages\Transformers\JJhzu\src\huggingface\download.jl:66 [inlined]
 [14] get_or_download_hgf_config
    @ C:\Users\Admin\.julia\packages\Transformers\JJhzu\src\huggingface\download.jl:66 [inlined]
 [15] load_hgf_pretrained(name::String)
    @ Transformers.HuggingFace C:\Users\Admin\.julia\packages\Transformers\JJhzu\src\huggingface\HuggingFace.jl:52
 [16] init()
    @ TweetSentiment C:\Users\Admin\Desktop\work\TweetSentiment\src\TweetSentiment.jl:12
 [17] top-level scope
    @ REPL[4]:1
 [18] top-level scope
    @ C:\Users\Admin\.julia\packages\CUDA\DfvRa\src\initialization.jl:52

Any pointers?

Pkg status

pkg> status
     Project TweetSentiment v0.1.0
      Status `C:\Users\Admin\Desktop\work\TweetSentiment\Project.toml`
  [052768ef] CUDA v3.12.0
  [587475ba] Flux v0.13.4
  [438e738f] PyCall v1.93.1
  [21ca0261] Transformers v0.1.18 `https://github.com/chengchingwen/Transformers.jl.git#master`

TagBot trigger issue

This issue is used to trigger TagBot; feel free to unsubscribe.

If you haven't already, you should update your TagBot.yml to include issue comment triggers.
Please see this post on Discourse for instructions and more details.

If you'd like for me to do this for you, comment TagBot fix on this issue.
I'll open a PR within a few hours, please be patient!

Failing Test

On current Julia master and Transformers master I have a failing test:

[ Info: Test BASIC
Basic: Test Failed at /home/gkraemer/.julia/dev/Transformers/test/test_basic.jl:10
  Expression: (p(x))[:, :, 2] == c(x[:, :, 2])
   Evaluated: Float32[0.027287507 0.16296661 … 0.2492562 0.4740091; 0.05988739 0.42276305 … 0.2984651 0.15890877; … ; 0.8943589 0.07297276 … 0.21390636 0.12822512; 0.0018252898 0.32664314 … 0.16835916 0.13867101] == Float32[0.027287507 0.16296661 … 0.2492562 0.47400913; 0.05988739 0.42276305 … 0.2984651 0.15890875; … ; 0.8943589 0.07297276 … 0.21390636 0.12822512; 0.0018252898 0.32664314 … 0.16835916 0.138671]
julia> using Transformers
julia> using Flux
julia> d = Flux.Dense(10, 5)
Dense(10, 5)
julia> c = Flux.Chain(d, Flux.softmax)
Chain(Dense(10, 5), softmax)
julia> p = Transformers.Positionwise(d, Flux.softmax)
Positionwise(Dense(10, 5), softmax)
julia> x = randn(10, 5, 3)
julia> p(x)[:, :, 2] .- c(x[:, :, 2])
5×5 Matrix{Float32}:
 0.0  0.0  0.0  0.0  -7.45058f-9
 0.0  0.0  0.0  0.0  -1.49012f-8
 0.0  0.0  0.0  0.0   0.0
 0.0  0.0  0.0  0.0  -2.79397f-9
 0.0  0.0  0.0  0.0   0.0

Not sure if these should be identical, or if the test could be changed from == to ≈.
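
A hedged sketch of the relaxed comparison, if approximate equality is acceptable here:

using Test
# tolerate Float32 rounding differences instead of requiring bitwise equality
@test p(x)[:, :, 2] ≈ c(x[:, :, 2])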

Running pretrained bert model causes error

bert_model, wordpiece, tokenizer = pretrain"bert-uncased_L-12_H-768_A-12"
ERROR: TypeError: in keyword argument cookiejar, expected Dict{String,Set{HTTP.Cookies.Cookie}}, got Array{Dict{String,Set{HTTP.Cookies.Cookie}},1}
Stacktrace:
 [1] (::getfield(HTTP, Symbol("#kw##request")))(::NamedTuple{(:reached_redirect_limit, :cookies, :cookiejar),Tuple{Bool,Bool,Array{Dict{String,Set{HTTP.Cookies.Cookie}},1}}}, ::typeof(HTTP.request), ::Type{HTTP.CookieRequest.CookieLayer{HTTP.MessageRequest.MessageLayer{HTTP.RetryRequest.RetryLayer{HTTP.ExceptionRequest.ExceptionLayer{HTTP.ConnectionRequest.ConnectionPoolLayer{HTTP.StreamRequest.StreamLayer{Union{}}}}}}}}, ::String, ::HTTP.URIs.URI, ::Array{Pair{SubString{String},SubString{String}},1}, ::Array{UInt8,1}) at ./none:0
 [2] #request#1(::Base.Iterators.Pairs{Symbol,Any,Tuple{Symbol,Symbol,Symbol},NamedTuple{(:reached_redirect_limit, :cookies, :cookiejar),Tuple{Bool,Bool,Array{Dict{String,Set{HTTP.Cookies.Cookie}},1}}}}, ::Function, ::Type{HTTP.BasicAuthRequest.BasicAuthLayer{HTTP.CookieRequest.CookieLayer{HTTP.MessageRequest.MessageLayer{HTTP.RetryRequest.RetryLayer{HTTP.ExceptionRequest.ExceptionLayer{HTTP.ConnectionRequest.ConnectionPoolLayer{HTTP.StreamRequest.StreamLayer{Union{}}}}}}}}}, ::String, ::HTTP.URIs.URI, ::Array{Pair{SubString{String},SubString{String}},1}, ::Array{UInt8,1}) at /home/zzj0402/.julia/packages/HTTP/ZggHU/src/BasicAuthRequest.jl:28
 [3] (::getfield(HTTP, Symbol("#kw##request")))(::NamedTuple{(:reached_redirect_limit, :cookies, :cookiejar),Tuple{Bool,Bool,Array{Dict{String,Set{HTTP.Cookies.Cookie}},1}}}, ::typeof(HTTP.request), ::Type{HTTP.BasicAuthRequest.BasicAuthLayer{HTTP.CookieRequest.CookieLayer{HTTP.MessageRequest.MessageLayer{HTTP.RetryRequest.RetryLayer{HTTP.ExceptionRequest.ExceptionLayer{HTTP.ConnectionRequest.ConnectionPoolLayer{HTTP.StreamRequest.StreamLayer{Union{}}}}}}}}}, ::String, ::HTTP.URIs.URI, ::Array{Pair{SubString{String},SubString{String}},1}, ::Array{UInt8,1}) at ./none:0
 [4] #request#1(::Int64, ::Bool, ::Base.Iterators.Pairs{Symbol,Any,Tuple{Symbol,Symbol},NamedTuple{(:cookies, :cookiejar),Tuple{Bool,Array{Dict{String,Set{HTTP.Cookies.Cookie}},1}}}}, ::Function, ::Type{HTTP.RedirectRequest.RedirectLayer{HTTP.BasicAuthRequest.BasicAuthLayer{HTTP.CookieRequest.CookieLayer{HTTP.MessageRequest.MessageLayer{HTTP.RetryRequest.RetryLayer{HTTP.ExceptionRequest.ExceptionLayer{HTTP.ConnectionRequest.ConnectionPoolLayer{HTTP.StreamRequest.StreamLayer{Union{}}}}}}}}}}, ::String, ::HTTP.URIs.URI, ::Array{Pair{SubString{String},SubString{String}},1}, ::Array{UInt8,1}) at /home/zzj0402/.julia/packages/HTTP/ZggHU/src/RedirectRequest.jl:24
 [5] (::getfield(HTTP, Symbol("#kw##request")))(::NamedTuple{(:cookies, :cookiejar),Tuple{Bool,Array{Dict{String,Set{HTTP.Cookies.Cookie}},1}}}, ::typeof(HTTP.request), ::Type{HTTP.RedirectRequest.RedirectLayer{HTTP.BasicAuthRequest.BasicAuthLayer{HTTP.CookieRequest.CookieLayer{HTTP.MessageRequest.MessageLayer{HTTP.RetryRequest.RetryLayer{HTTP.ExceptionRequest.ExceptionLayer{HTTP.ConnectionRequest.ConnectionPoolLayer{HTTP.StreamRequest.StreamLayer{Union{}}}}}}}}}}, ::String, ::HTTP.URIs.URI, ::Array{Pair{SubString{String},SubString{String}},1}, ::Array{UInt8,1}) at ./none:0
 [6] #request#4(::Array{Pair{SubString{String},SubString{String}},1}, ::Array{UInt8,1}, ::Nothing, ::Base.Iterators.Pairs{Symbol,Any,Tuple{Symbol,Symbol},NamedTuple{(:cookies, :cookiejar),Tuple{Bool,Array{Dict{String,Set{HTTP.Cookies.Cookie}},1}}}}, ::Function, ::String, ::String, ::Array{Pair{SubString{String},SubString{String}},1}, ::Array{UInt8,1}) at /home/zzj0402/.julia/packages/HTTP/ZggHU/src/HTTP.jl:314
 [7] #request at ./none:0 [inlined] (repeats 2 times)
 [8] download_gdrive(::String, ::String) at /home/zzj0402/.julia/packages/Transformers/2BZJs/src/datasets/download_utils.jl:56
 [9] run_fetch(::typeof(Transformers.Datasets.download_gdrive), ::String, ::String) at /home/zzj0402/.julia/packages/DataDeps/gbyoa/src/resolution_automatic.jl:99
 [10] #download#13(::String, ::Nothing, ::Bool, ::Function, ::DataDeps.DataDep{String,String,typeof(Transformers.Datasets.download_gdrive),typeof(identity)}, ::String) at /home/zzj0402/.julia/packages/DataDeps/gbyoa/src/resolution_automatic.jl:78
 [11] download(::DataDeps.DataDep{String,String,typeof(Transformers.Datasets.download_gdrive),typeof(identity)}, ::String) at /home/zzj0402/.julia/packages/DataDeps/gbyoa/src/resolution_automatic.jl:70
 [12] handle_missing(::DataDeps.DataDep{String,String,typeof(Transformers.Datasets.download_gdrive),typeof(identity)}, ::String) at /home/zzj0402/.julia/packages/DataDeps/gbyoa/src/resolution_automatic.jl:10
 [13] _resolve(::DataDeps.DataDep{String,String,typeof(Transformers.Datasets.download_gdrive),typeof(identity)}, ::String) at /home/zzj0402/.julia/packages/DataDeps/gbyoa/src/resolution.jl:83
 [14] resolve(::DataDeps.DataDep{String,String,typeof(Transformers.Datasets.download_gdrive),typeof(identity)}, ::String, ::String) at /home/zzj0402/.julia/packages/DataDeps/gbyoa/src/resolution.jl:29
 [15] resolve(::String, ::String, ::String) at /home/zzj0402/.julia/packages/DataDeps/gbyoa/src/resolution.jl:54
 [16] resolve at /home/zzj0402/.julia/packages/DataDeps/gbyoa/src/resolution.jl:73 [inlined]
 [17] #load_pretrain#3(::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}, ::Function, ::String) at /home/zzj0402/.julia/packages/Transformers/2BZJs/src/pretrain/Pretrain.jl:64
 [18] load_pretrain(::String) at /home/zzj0402/.julia/packages/Transformers/2BZJs/src/pretrain/Pretrain.jl:53
 [19] top-level scope at none:0

Module Broken?

(@JuliaPro_v1.5.0-1)

julia> using Transformers
[ Info: Precompiling Transformers [21ca0261-441d-5938-ace7-c90938fde4d4]
WARNING: Method definition _pullback(Zygote.Context, typeof(Flux.onehot), Any...) in module Flux at C:\Users\xxx\.julia\packages\Zygote\rqvFi\src\lib\grad.jl:8 overwritten in module Basic at C:\Users\xxx\.julia\packages\Zygote\rqvFi\src\lib\grad.jl:8.
  ** incremental compilation may be fatally broken for this module **

Transformer tutorial is outdated

I get an ERROR: syntax: invalid syntax "preprocess.(sample_data()) = ..." on the last line when I run the code below. The code is from the tutorial on this page: https://chengchingwen.github.io/Transformers.jl/dev/tutorial/#Transformers.jl.

using Transformers
using Transformers.Basic
using Transformers.Pretrain
using CUDA
using Flux
enable_gpu(true)

labels = map(string, 1:10)
startsym = "11"
endsym = "12"
unksym = "0"
labels = [unksym, startsym, endsym, labels...]
vocab = Vocabulary(labels, unksym)

# function to generate training data
sample_data() = (d = map(string, rand(1:10, 10)); (d,d))
# function for adding start & end symbols
preprocess(x) = [startsym, x..., endsym]

@show sample = preprocess.(sample_data())
@show encoded_sample = vocab(sample[1]) #use Vocabulary to encode the training data

sample = preprocess.(sample_data()) = (["11", "10", "8", "1", "10", "7", "10", "4", "2", "3", "3", "12"], ["11", "10", "8", "1", "10", "7", "10", "4", "2", "3", "3", "12"])
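
A hedged workaround, assuming the parse error comes only from the assignment inside @show, is to separate the assignment from the macro:

# perform the assignment first, then show the result, so @show never has to
# parse the broadcast call as an assignment target
sample = preprocess.(sample_data())
@show sample
encoded_sample = vocab(sample[1]) # use Vocabulary to encode the training data
@show encoded_sample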

Enable CI

Please enable CI for this repository, e.g. Travis/Appveyor.

Transformers.jl Package Installation Broken!

I have tried several times deleting the Transformers package in /packages, making a new virtual environment (activate newenv), and adding CUDA and Transformers, but every time I run using Transformers I get the following output.

julia> using Transformers
[ Info: Precompiling Transformers [21ca0261-441d-5938-ace7-c90938fde4d4]
ERROR: LoadError: UndefVarError: applychain not defined
Stacktrace:
  [1] include(mod::Module, _path::String)
    @ Base .\Base.jl:418
  [2] include(x::String)
    @ Transformers.Basic C:\Users\jackn\.julia\packages\Transformers\K1F88\src\basic\Basic.jl:1
  [3] top-level scope
    @ C:\Users\jackn\.julia\packages\Transformers\K1F88\src\basic\Basic.jl:25
  [4] include(mod::Module, _path::String)
    @ Base .\Base.jl:418
  [5] include(x::String)
    @ Transformers C:\Users\jackn\.julia\packages\Transformers\K1F88\src\Transformers.jl:1
  [6] top-level scope
    @ C:\Users\jackn\.julia\packages\Transformers\K1F88\src\Transformers.jl:70
  [7] include
    @ .\Base.jl:418 [inlined]
  [8] include_package_for_output(pkg::Base.PkgId, input::String, depot_path::Vector{String}, dl_load_path::Vector{String}, load_path::Vector{String}, concrete_deps::Vector{Pair{Base.PkgId, UInt64}}, source::Nothing)
    @ Base .\loading.jl:1318
  [9] top-level scope
    @ none:1
 [10] eval
    @ .\boot.jl:373 [inlined]
 [11] eval(x::Expr)
    @ Base.MainInclude .\client.jl:453
 [12] top-level scope
    @ none:1
in expression starting at C:\Users\jackn\.julia\packages\Transformers\K1F88\src\basic\extend3d.jl:4
in expression starting at C:\Users\jackn\.julia\packages\Transformers\K1F88\src\basic\Basic.jl:1
in expression starting at C:\Users\jackn\.julia\packages\Transformers\K1F88\src\Transformers.jl:1
ERROR: Failed to precompile Transformers [21ca0261-441d-5938-ace7-c90938fde4d4] to C:\Users\jackn\.julia\compiled\v1.7\Transformers\jl_FB9C.tmp.  
Stacktrace:
  [1] error(s::String)
    @ Base .\error.jl:33
  [2] compilecache(pkg::Base.PkgId, path::String, internal_stderr::IO, internal_stdout::IO, ignore_loaded_modules::Bool)
    @ Base .\loading.jl:1466
  [3] compilecache(pkg::Base.PkgId, path::String)
    @ Base .\loading.jl:1410
  [4] _require(pkg::Base.PkgId)
    @ Base .\loading.jl:1120
  [5] require(uuidkey::Base.PkgId)
    @ Base .\loading.jl:1013
  [6] require(into::Module, mod::Symbol)
    @ Base .\loading.jl:997
  [7] eval
    @ .\boot.jl:373 [inlined]
  [8] eval
    @ .\Base.jl:68 [inlined]
  [9] repleval(m::Module, code::Expr, #unused#::String)
    @ VSCodeServer c:\Users\jackn\.vscode\extensions\julialang.language-julia-1.6.24\scripts\packages\VSCodeServer\src\repl.jl:157
 [10] (::VSCodeServer.var"#78#80"{Module, Expr, REPL.LineEditREPL, REPL.LineEdit.Prompt})()
    @ VSCodeServer c:\Users\jackn\.vscode\extensions\julialang.language-julia-1.6.24\scripts\packages\VSCodeServer\src\repl.jl:123
 [11] with_logstate(f::Function, logstate::Any)
    @ Base.CoreLogging .\logging.jl:511
 [12] with_logger
    @ .\logging.jl:623 [inlined]
 [13] (::VSCodeServer.var"#77#79"{Module, Expr, REPL.LineEditREPL, REPL.LineEdit.Prompt})()
    @ VSCodeServer c:\Users\jackn\.vscode\extensions\julialang.language-julia-1.6.24\scripts\packages\VSCodeServer\src\repl.jl:124
 [14] #invokelatest#2
    @ .\essentials.jl:716 [inlined]
 [15] invokelatest(::Any)
    @ Base .\essentials.jl:714
 [16] macro expansion
    @ c:\Users\jackn\.vscode\extensions\julialang.language-julia-1.6.24\scripts\packages\VSCodeServer\src\eval.jl:34 [inlined]
 [17] (::VSCodeServer.var"#60#61")()
    @ VSCodeServer .\task.jl:423

MethodError: no method matching AbstractFloat(::Type{Any}) with HuggingFace.GPT2

I was working with the HuggingFace GPT2 model. I am trying to differentiate a loss on the GPT2 logits with respect to pre-calculated past_key_values. While the forward pass works correctly, I get the following error in the backward pass:

MethodError: no method matching AbstractFloat(::Type{Any})
Closest candidates are:
  (::Type{T})(::AbstractChar) where T<:Union{AbstractChar, Number} at char.jl:50
  (::Type{T})(::Base.TwicePrecision) where T<:Number at twiceprecision.jl:243
  (::Type{T})(::Flux.NilNumber.Nil) where T<:Number at /home/adarshkumar712/.julia/packages/Flux/0c9kI/src/outputsize.jl:17
  ...

Stacktrace:
  [1] float(x::Type)
    @ Base ./float.jl:206
  [2] (::ComposedFunction{typeof(float), typeof(eltype)})(x::Function)
    @ Base ./operators.jl:938
  [3] softmax(x::Function; dims::Int64)
    @ NNlib ~/.julia/packages/NNlib/3MZcC/src/softmax.jl:48
  [4] rrule(::typeof(softmax), xs::Function; dims::Int64)
    @ NNlib ~/.julia/packages/NNlib/3MZcC/src/softmax.jl:80
  [5] chain_rrule_kw
    @ ~/.julia/packages/Zygote/zowrf/src/compiler/chainrules.jl:101 [inlined]
  [6] macro expansion
    @ ~/.julia/packages/Zygote/zowrf/src/compiler/interface2.jl:0 [inlined]
  [7] _pullback(::Zygote.Context, ::NNlib.var"#softmax##kw", ::NamedTuple{(:dims,), Tuple{Int64}}, ::typeof(softmax), ::typeof(Transformers.HuggingFace.apply_shift_mask))
    @ Zygote ~/.julia/packages/Zygote/zowrf/src/compiler/interface2.jl:9
  [8] _pullback
    @ ~/.julia/packages/Transformers/rCnGb/src/huggingface/models/gpt2.jl:124 [inlined]
  [9] _pullback(::Zygote.Context, ::typeof(Transformers.HuggingFace._attn), ::Array{Float32, 4}, ::Array{Float32, 4}, ::Array{Float32, 4}, ::Transformers.HuggingFace.ShiftAttentionMask{Array{Float32, 4}})
    @ Zygote ~/.julia/packages/Zygote/zowrf/src/compiler/interface2.jl:0
 [10] _pullback
    @ ~/.julia/packages/Transformers/rCnGb/src/huggingface/models/gpt2.jl:138 [inlined]
 [11] _pullback(::Zygote.Context, ::Transformers.HuggingFace.HGFGPT2Attention{Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}, ::Array{Float32, 4}, ::Array{Float32, 4}, ::Array{Float32, 4}, ::Transformers.HuggingFace.ShiftAttentionMask{Array{Float32, 4}}, ::Val{false}, ::Val{true})
    @ Zygote ~/.julia/packages/Zygote/zowrf/src/compiler/interface2.jl:0
 [12] _pullback
    @ ~/.julia/packages/Transformers/rCnGb/src/huggingface/models/gpt2.jl:97 [inlined]
 [13] _pullback(::Zygote.Context, ::Transformers.HuggingFace.HGFGPT2Attention{Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}, ::Array{Float32, 3}, ::Tuple{Array{Float32, 4}, Array{Float32, 4}}, ::Transformers.HuggingFace.ShiftAttentionMask{Array{Float32, 4}}, ::Val{false}, ::Val{true})
    @ Zygote ~/.julia/packages/Zygote/zowrf/src/compiler/interface2.jl:0
 [14] _pullback
    @ ~/.julia/packages/Transformers/rCnGb/src/huggingface/models/gpt2.jl:224 [inlined]
 [15] _pullback(::Zygote.Context, ::Transformers.HuggingFace.HGFGPT2Block{Transformers.HuggingFace.HGFGPT2Attention{Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.HGFGPT2MLP{typeof(gelu), Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}}, ::Array{Float32, 3}, ::Tuple{Array{Float32, 4}, Array{Float32, 4}}, ::Transformers.HuggingFace.ShiftAttentionMask{Array{Float32, 4}}, ::Val{false}, ::Val{true})
    @ Zygote ~/.julia/packages/Zygote/zowrf/src/compiler/interface2.jl:0
 [16] macro expansion
    @ ~/.julia/packages/Transformers/rCnGb/src/huggingface/models/gpt2.jl:413 [inlined]
 [17] _pullback
    @ ~/.julia/packages/Transformers/rCnGb/src/huggingface/models/gpt2.jl:326 [inlined]
 [18] _pullback(::Zygote.Context, ::Transformers.HuggingFace.HGFGPT2Model{12, Transformers.HuggingFace.FakeTHModuleList{12, NTuple{12, Transformers.HuggingFace.HGFGPT2Block{Transformers.HuggingFace.HGFGPT2Attention{Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.HGFGPT2MLP{typeof(gelu), Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}}}}, Transformers.HuggingFace.FakeTHEmbedding{Matrix{Float32}}, Transformers.HuggingFace.FakeTHEmbedding{Matrix{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}}, ::Matrix{Int64}, ::Nothing, ::Nothing, ::NTuple{12, Tuple{Array{Float32, 4}, Array{Float32, 4}}}, ::Nothing, ::Val{false}, ::Val{true}, ::Val{true})
    @ Zygote ~/.julia/packages/Zygote/zowrf/src/compiler/interface2.jl:0
 [19] _pullback
    @ ~/.julia/packages/Transformers/rCnGb/src/huggingface/models/gpt2.jl:473 [inlined]
 [20] _pullback(::Zygote.Context, ::Transformers.HuggingFace.HGFGPT2LMHeadModel{Transformers.HuggingFace.HGFGPT2Model{12, Transformers.HuggingFace.FakeTHModuleList{12, NTuple{12, Transformers.HuggingFace.HGFGPT2Block{Transformers.HuggingFace.HGFGPT2Attention{Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.HGFGPT2MLP{typeof(gelu), Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}}}}, Transformers.HuggingFace.FakeTHEmbedding{Matrix{Float32}}, Transformers.HuggingFace.FakeTHEmbedding{Matrix{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}}, Transformers.HuggingFace.FakeTHLinear{LinearAlgebra.Transpose{Float32, Matrix{Float32}}, Nothing}}, ::Matrix{Int64}, ::Nothing, ::Nothing, ::NTuple{12, Tuple{Array{Float32, 4}, Array{Float32, 4}}}, ::Nothing, ::Val{false}, ::Val{true}, ::Val{true})
    @ Zygote ~/.julia/packages/Zygote/zowrf/src/compiler/interface2.jl:0
 [21] _pullback
    @ ~/.julia/packages/Transformers/rCnGb/src/huggingface/models/gpt2.jl:449 [inlined]
 [22] _pullback(::Zygote.Context, ::Transformers.HuggingFace.var"##_#201", ::Nothing, ::Nothing, ::NTuple{12, Tuple{Array{Float32, 4}, Array{Float32, 4}}}, ::Nothing, ::Bool, ::Bool, ::Bool, ::Transformers.HuggingFace.HGFGPT2LMHeadModel{Transformers.HuggingFace.HGFGPT2Model{12, Transformers.HuggingFace.FakeTHModuleList{12, NTuple{12, Transformers.HuggingFace.HGFGPT2Block{Transformers.HuggingFace.HGFGPT2Attention{Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.HGFGPT2MLP{typeof(gelu), Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}}}}, Transformers.HuggingFace.FakeTHEmbedding{Matrix{Float32}}, Transformers.HuggingFace.FakeTHEmbedding{Matrix{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}}, Transformers.HuggingFace.FakeTHLinear{LinearAlgebra.Transpose{Float32, Matrix{Float32}}, Nothing}}, ::Matrix{Int64})
    @ Zygote ~/.julia/packages/Zygote/zowrf/src/compiler/interface2.jl:0
 [23] _pullback
    @ ~/.julia/packages/Transformers/rCnGb/src/huggingface/models/gpt2.jl:449 [inlined]
 [24] _pullback(::Zygote.Context, ::Core.var"#Any##kw", ::NamedTuple{(:past_key_values, :output_attentions, :output_hidden_states, :use_cache), Tuple{NTuple{12, Tuple{Array{Float32, 4}, Array{Float32, 4}}}, Bool, Bool, Bool}}, ::Transformers.HuggingFace.HGFGPT2LMHeadModel{Transformers.HuggingFace.HGFGPT2Model{12, Transformers.HuggingFace.FakeTHModuleList{12, NTuple{12, Transformers.HuggingFace.HGFGPT2Block{Transformers.HuggingFace.HGFGPT2Attention{Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.HGFGPT2MLP{typeof(gelu), Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}}}}, Transformers.HuggingFace.FakeTHEmbedding{Matrix{Float32}}, Transformers.HuggingFace.FakeTHEmbedding{Matrix{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}}, Transformers.HuggingFace.FakeTHLinear{LinearAlgebra.Transpose{Float32, Matrix{Float32}}, Nothing}}, ::Matrix{Int64})
    @ Zygote ~/.julia/packages/Zygote/zowrf/src/compiler/interface2.jl:0
 [25] _pullback
    @ ./In[8]:3 [inlined]
 [26] _pullback(::Zygote.Context, ::var"#1#2")
    @ Zygote ~/.julia/packages/Zygote/zowrf/src/compiler/interface2.jl:0
 [27] pullback(f::Function, ps::Params)
    @ Zygote ~/.julia/packages/Zygote/zowrf/src/compiler/interface.jl:250
 [28] top-level scope
    @ In[8]:2
 [29] eval
    @ ./boot.jl:360 [inlined]
 [30] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
    @ Base ./loading.jl:1094

I believe that this is something related to the ShiftAttentionMask.
Here is the MWE:

using Transformers.HuggingFace
using Zygote, Flux
model = hgf"gpt2:lmheadmodel"
tokens = reshape(Array(1:10),(:,1));
outputs = model(tokens[1:end-1, :]; position_ids=nothing, token_type_ids=nothing,
                                    past_key_values=nothing,
                                    attention_mask=nothing,
                                    output_attentions=true,
                                    output_hidden_states=true,
                                    use_cache=true);
past = outputs.past_key_values
prev = tokens[end:end, :];
ps = params(past)
_, back = Zygote.pullback(ps) do
    output_1 = model(prev; past_key_values=past,
                        output_attentions=false,
                        output_hidden_states=true,
                        use_cache=true);
    hidden = output_1.hidden_states[end]
    logits = model.lm_head(hidden)[:, end, :]
    logits[1]
end

Please let me know if I am doing something wrong in this code.
cc @chengchingwen

Incorrect method signature with Embed and CUDA

I use this fantastic library for a small study at university.
When I use a trainable embedding on the GPU, I get the following error:

ERROR: LoadError: MethodError: no method matching scatter_add!(::CuArray{Float32,2}, ::CuArray{Float32,2}, ::Array{Int64,1})
Closest candidates are:
  scatter_add!(::CuArray{T,2}, ::CuArray{T,N} where N, ::CuArray{Int64,N} where N) where T at /home/me/.julia/packages/Transformers/ko7g9/src/cuda/scatter_gpu.jl:3
  scatter_add!(::CuArray{T,N} where N, ::CuArray{T,N} where N, ::CuArray{var"#s62",N} where N where var"#s62"<:Tuple) where T at /home/me/.julia/packages/Transformers/ko7g9/src/cuda/scatter_gpu.jl:32
  scatter_add!(::Array{T,2}, ::Array{T,N} where N, ::Array{Int64,N} where N) where T at /home/me/.julia/packages/Transformers/ko7g9/src/fix/scatter.jl:2
Stacktrace:
 [1] ∇gather(::CuArray{Float32,2}, ::CuArray{Float32,2}, ::Array{Int64,1}) at /home/me/.julia/packages/Transformers/ko7g9/src/basic/embeds/gather.jl:41
 [2] (::Transformers.Basic.var"#33#34"{CuArray{Float32,2},Array{Int64,1}})(::CuArray{Float32,2}) at /home/me/.julia/packages/Transformers/ko7g9/src/basic/embeds/gather.jl:55
 [3] (::Transformers.Basic.var"#294#back#35"{Transformers.Basic.var"#33#34"{CuArray{Float32,2},Array{Int64,1}}})(::CuArray{Float32,2}) at /home/me/.julia/packages/ZygoteRules/OjfTt/src/adjoint.jl:59
 [4] Embed at /home/me/.julia/packages/Transformers/ko7g9/src/basic/embeds/embed.jl:25 [inlined]
 [5] (::typeof(∂(λ)))(::CuArray{Float32,2}) at /home/me/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [6] Embed at /home/me/.julia/packages/Transformers/ko7g9/src/basic/embeds/embed.jl:21 [inlined]
 [7] (::typeof(∂(λ)))(::CuArray{Float32,2}) at /home/me/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [8] WordPositionEmbed at /home/me/project/src/model/embed.jl:13 [inlined]
 [9] (::typeof(∂(λ)))(::CuArray{Float32,2}) at /home/me/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [10] TransformersModel at /home/me/project/src/model/transformers.jl:44 [inlined]
 [11] (::typeof(∂(λ)))(::CuArray{Float32,2}) at /home/me/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [12] loss at /home/me/project/src/training.jl:60 [inlined]
 [13] (::typeof(∂(loss)))(::Float32) at /home/me/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [14] #103 at /home/me/project/src/training.jl:99 [inlined]
 [15] (::typeof(∂(λ)))(::Float32) at /home/me/.julia/packages/Zygote/ggM8Z/src/compiler/interface2.jl:0
 [16] (::Zygote.var"#54#55"{Params,Zygote.Context,typeof(∂(λ))})(::Float32) at /home/me/.julia/packages/Zygote/ggM8Z/src/compiler/interface.jl:172
 [17] gradient(::Function, ::Params) at /home/me/.julia/packages/Zygote/ggM8Z/src/compiler/interface.jl:49
 [18] top-level scope at timing.jl:310
 [19] top-level scope at /home/me/project/src/training.jl:98
 [20] include(::Function, ::Module, ::String) at ./Base.jl:380
 [21] include(::Module, ::String) at ./Base.jl:368
 [22] exec_options(::Base.JLOptions) at ./client.jl:296
 [23] _start() at ./client.jl:506
in expression starting at /home/me/project/src/training.jl:84

I'm using Transformers 0.1.7 with Flux 0.11.1 on Julia 1.5.3.
It seems like the ∇gather function does not work with CuArrays (it also says the function is for CPU):

# cpu gather back
function ∇gather(Δ::AbstractArray{T}, w::AbstractMatrix{T}, xs::AbstractArray{Int}) where T
    ys = fill!(similar(w), zero(T))
    scatter_add!(ys, Δ, xs)
    return ys
end

Any help is highly appreciated! 👍
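A possible workaround on my side (just a sketch, untested): the CUDA method of scatter_add! in the error above only accepts CuArray{Int64} indices, so keeping the token indices on the GPU as well might let the gradient dispatch to that method.

# Untested sketch: move the index array to the GPU too, so the gradient of the embedding
# can hit scatter_add!(::CuArray, ::CuArray, ::CuArray{Int64}) instead of failing to find
# a method for Array{Int64} indices. The tiny Embed below is only for illustration.
using Flux, CUDA
using Transformers.Basic

embed  = Embed(16, 100) |> gpu    # 16-dim embedding over a 100-token vocabulary
tokens = gpu(rand(1:100, 8))      # CuArray{Int64} indices instead of Array{Int64}
ps = Flux.params(embed)
gs = Flux.gradient(() -> sum(embed(tokens)), ps)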

Segfault in embedding function

I'm following the example from the docs here, but I almost always get a segfault when I try to evaluate the embedding function. The dimension of the embedding seems to make the problem worse (e.g. if I set value below to 512, it always segfaults; when it's 256, it segfaults maybe 85% of the time). This is on Julia 1.6.2. I would appreciate any suggestions!

MWE:

using Transformers
using Transformers.Basic
using Transformers.Pretrain

bert_model, wordpiece, tokenizer = pretrain"bert-uncased_L-12_H-768_A-12"
vocab = Vocabulary(wordpiece)
value = 256
pe = PositionEmbedding(value)

embed = Embed(value, 100)

function embedding(x)
    we = embed(x, inv(sqrt(512)))
    e = we .+ pe(we)
    return e
end

v = [8739 2008]
embedding(v)

The error I get follows, if it's of any help:

signal (11): Segmentation fault: 11
in expression starting at /Users/austinbean/Desktop/programs/emr_nlp/seg_mwe.jl:24
getindex at ./array.jl:802 [inlined]
macro expansion at ./multidimensional.jl:860 [inlined]
macro expansion at ./cartesian.jl:64 [inlined]
_unsafe_getindex! at ./multidimensional.jl:855 [inlined]
_unsafe_getindex at ./multidimensional.jl:846
_getindex at ./multidimensional.jl:832 [inlined]
getindex at ./abstractarray.jl:1170 [inlined]
macro expansion at /Users/austinbean/.julia/packages/Transformers/3YgSd/src/basic/embeds/gather.jl:18 [inlined]
#272#threadsfor_fun at ./threadingconstructs.jl:81
#272#threadsfor_fun at ./threadingconstructs.jl:48
unknown function (ip: 0x113fa4c8c)
jl_apply_generic at /Applications/Julia-1.6.app/Contents/Resources/julia/lib/julia/libjulia-internal.1.6.dylib (unknown line)
start_task at /Applications/Julia-1.6.app/Contents/Resources/julia/lib/julia/libjulia-internal.1.6.dylib (unknown line)
Allocations: 43744691 (Pool: 43733847; Big: 10844); GC: 31
zsh: segmentation fault julia
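One thing that stands out to me after the fact (my guess at the cause, not confirmed): Embed(value, 100) only holds 100 embedding vectors, while the indices in v (8739 and 2008) point far outside that table, so the threaded gather shown in the backtrace may simply be reading out of bounds. Sizing the table to the vocabulary keeps every lookup in range:

# Guess, not confirmed: make the second argument of Embed the vocabulary size so the
# lookups in v stay inside the embedding table.
embed = Embed(value, length(vocab))

v = [8739 2008]
embedding(v)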

scibert models missing loading_method?

I can load all the bert models but none of the scibert ones:

julia> bert_model, wordpiece, tokenizer = pretrain"bert-uncased_L-12_H-768_A-12"
[ Info: loading pretrain bert model: uncased_L-12_H-768_A-12.tfbson 
...

julia> bert_model, wordpiece, tokenizer = pretrain"scibert-scibert_scivocab_uncased"
ERROR: unknown pretrain type
Stacktrace:
 [1] error(s::String)
   @ Base ./error.jl:33
 [2] loading_method(x::Val{:scibert})
   @ Transformers.Pretrain ~/.julia/packages/Transformers/jtjKq/src/pretrain/Pretrain.jl:46
 [3] load_pretrain(str::String; kw::Base.Pairs{Symbol, Union{}, Tuple{}, NamedTuple{(), Tuple{}}})
   @ Transformers.Pretrain ~/.julia/packages/Transformers/jtjKq/src/pretrain/Pretrain.jl:58
 [4] load_pretrain(str::String)
   @ Transformers.Pretrain ~/.julia/packages/Transformers/jtjKq/src/pretrain/Pretrain.jl:57
 [5] top-level scope
   @ REPL[12]:1

It seems there is no loading_method defined for :scibert.

Basic.onehot

I tried to run the tutorial, but something seems to be wrong with the onehot function call inside the definition of loss. The error message is below.

Help is appreciated. Thanks, Gerhard

[ Info: start training
ERROR: LoadError: MethodError: no method matching OneHotArray(::Transformers.Basic.OneHot{0x0000000d})
Closest candidates are:
OneHotArray(::Any, ::Transformers.Basic.OneHot) at /home/gjaeger/.julia/packages/Transformers/3YgSd/src/basic/embeds/onehot.jl:115
OneHotArray(::Any, ::Any) at /home/gjaeger/.julia/packages/Transformers/3YgSd/src/basic/embeds/onehot.jl:114
OneHotArray(::A) where {K, A<:(AbstractArray{Transformers.Basic.OneHot{K}, N} where N)} at /home/gjaeger/.julia/packages/Transformers/3YgSd/src/basic/embeds/onehot.jl:112
Stacktrace:
[1] OneHotArray(k::Int64, xs::Int64)
@ Transformers.Basic ~/.julia/packages/Transformers/3YgSd/src/basic/embeds/onehot.jl:114
[2] onehot(v::Vocabulary{Int64}, x::CuArray{Int64, 2, CUDA.Mem.DeviceBuffer})
@ Transformers.Basic ~/.julia/packages/Transformers/3YgSd/src/basic/embeds/vocab.jl:95
[3] _pullback
@ ~/.julia/packages/Zygote/l3aNG/src/lib/grad.jl:8 [inlined]
[4] _pullback
@ ~/projects/research/tryTransformer/code/tutorial.jl:99 [inlined]
[5] _pullback(::Zygote.Context, ::typeof(loss), ::CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}, ::CuArray{Int64, 2, CUDA.Mem.DeviceBuffer})
@ Zygote ~/.julia/packages/Zygote/l3aNG/src/compiler/interface2.jl:0
[6] _pullback
@ ~/projects/research/tryTransformer/code/tutorial.jl:133 [inlined]
[7] _pullback(::Zygote.Context, ::var"#8#10")
@ Zygote ~/.julia/packages/Zygote/l3aNG/src/compiler/interface2.jl:0
[8] pullback(f::Function, ps::Zygote.Params)
@ Zygote ~/.julia/packages/Zygote/l3aNG/src/compiler/interface.jl:343
[9] gradient(f::Function, args::Zygote.Params)
@ Zygote ~/.julia/packages/Zygote/l3aNG/src/compiler/interface.jl:75
[10] train!()
@ Main ~/projects/research/tryTransformer/code/tutorial.jl:133
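A guess at a workaround (untested): build the one-hot labels from host indices instead of calling onehot on a CuArray directly.

# Untested guess: collect copies the CuArray of labels back to the CPU before onehot;
# whether the resulting one-hot array then needs to be moved back to the device for the
# rest of the loss is something I'm not sure about.
label = onehot(vocab, collect(y))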

ckpt2json doesn't work well with */adam_m and */adam_v?

The function ckpt2bson may not work well: it can assign the "/adam_m" or "/adam_v" tensors to dense layers and other components, because some TensorFlow checkpoint data for BERT includes entries ending with "/adam_m" or "/adam_v".

Thus, I suggest that these functions recognize and handle "/adam_m" and "/adam_v" entries.

Suggestions

1. Change two functions:
   - readckpt: transpose weights containing "/adam_m" or "/adam_v"
   - load_bert_from_tfbson: ignore weights containing "/adam_m" or "/adam_v"

function readckpt(path)
...
      if length(shape) == 2 && !occursin("cls/seq_relationship/output_weights" , name) **# HERE**
        weight = collect(weight')
      end
...

function load_bert_from_tfbson(config, weights)
...
    #tf namespace handling
    weights = filter(x -> !occursin(r"/adam_[mv]", x.first), weights)  **# HERE**
    vnames = keys(weights)
...
end

2. Change the function readckpt to continue the for-loop when the name contains "/adam_m" or "/adam_v"
   (this cannot output the raw ckpt, because "/adam_m" and "/adam_v" are removed from the weights)

function readckpt(path)
....
    for (name, shape) ∈ shapes
        occursin(r"/adam_[mv]", name) && continue  **# HERE**
        weight = ckpt.get_tensor(name)
...

IWSLT2016 link outdated

mentioned in #72.

The link we used for IWSLT2016 is outdated. We should use the new one. However, each language pair used to have a separate link, and the new one compresses all language pairs into one tarball, so the DataDep would need to be rewritten.

how to use the IWSLT2016 dataset

Hi - I want to play around with some language translation tasks and saw that you've got Transformers.Datasets.IWSLT.IWSLT2016. How do I interact with this to get data that I can train a model on? I couldn't find anything in the documentation to help me out.
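For what it's worth, here is what I have been trying, assuming IWSLT2016 follows the same Datasets interface as the WMT datasets in the docs (the (:en, :de) pair argument and the helper names are my guesses):

# Sketch under the assumption that IWSLT2016 exposes dataset/get_batch/get_vocab like the
# WMT datasets; the language-pair arguments are guessed.
using Transformers.Datasets
using Transformers.Datasets: IWSLT

iwslt = IWSLT.IWSLT2016(:en, :de)   # source/target language pair (guessed signature)
datas = dataset(Train, iwslt)       # iterators of (source, target) sentence pairs
batch = get_batch(datas, 8)         # a batch of 8 training pairs
vocab = get_vocab(iwslt)            # vocabulary built from the training data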

Julia version dependency?

I'm having a hard time getting everything working on Julia 1.5. Is there a Julia version dependency?

Tutorial throws a dimensionMismatch error on grad = gradient(()->loss(x, y), ps)

I copy-pasted the tutorial into VS Code and have been trying to run it for the past few days, to no avail.

All that happens is that it runs slowly for ages and then crashes with the following error:

ERROR: LoadError: DimensionMismatch("arrays could not be broadcast to a common size; got a dimension with lengths 0 and 11")
Stacktrace:
  [1] _bcs1
    @ .\broadcast.jl:501 [inlined]
  [2] _bcs(shape::Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}, newshape::Tuple{Base.OneTo{Int64}, Base.OneTo{Int64}}) (repeats 2 times)
    @ Base.Broadcast .\broadcast.jl:495
  [3] broadcast_shape
    @ .\broadcast.jl:489 [inlined]
  [4] combine_axes
    @ .\broadcast.jl:484 [inlined]
  [5] instantiate
    @ .\broadcast.jl:266 [inlined]
  [6] materialize
    @ .\broadcast.jl:883 [inlined]
  [7] adjoint
    @ C:\Users\Jade\.julia\packages\Zygote\BCfwJ\src\lib\broadcast.jl:74 [inlined]
  [8] _pullback
    @ C:\Users\Jade\.julia\packages\ZygoteRules\AIbCs\src\adjoint.jl:65 [inlined]
  [9] _pullback
    @ C:\Users\Jade\.julia\packages\Transformers\V363g\src\basic\loss.jl:25 [inlined]
 [10] _pullback(::Zygote.Context, ::typeof(logkldivergence), ::Array{Float32, 3}, ::CuArray{Float32, 3, CUDA.Mem.DeviceBuffer})
    @ Zygote C:\Users\Jade\.julia\packages\Zygote\BCfwJ\src\compiler\interface2.jl:0
 [11] _pullback
    @ C:\Users\Jade\OneDrive\Documents\juliaStuff\test.jl:82 [inlined]
 [12] _pullback(::Zygote.Context, ::typeof(loss), ::CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}, ::CuArray{Int64, 2, CUDA.Mem.DeviceBuffer})
    @ Zygote C:\Users\Jade\.julia\packages\Zygote\BCfwJ\src\compiler\interface2.jl:0
 [13] _pullback
    @ C:\Users\Jade\OneDrive\Documents\juliaStuff\test.jl:105 [inlined]
 [14] _pullback(::Zygote.Context, ::var"#4#6")
    @ Zygote C:\Users\Jade\.julia\packages\Zygote\BCfwJ\src\compiler\interface2.jl:0
 [15] pullback(f::Function, ps::Zygote.Params)
    @ Zygote C:\Users\Jade\.julia\packages\Zygote\BCfwJ\src\compiler\interface.jl:338
 [16] gradient(f::Function, args::Zygote.Params)
    @ Zygote C:\Users\Jade\.julia\packages\Zygote\BCfwJ\src\compiler\interface.jl:75
 [17] train!()
    @ Main C:\Users\Jade\OneDrive\Documents\juliaStuff\test.jl:105
 [18] top-level scope
    @ C:\Users\Jade\OneDrive\Documents\juliaStuff\test.jl:114

BERT CoLA Example Question

I am trying to train several BERT classifier models in one program, but I am running out of GPU RAM by loading too many BERT models with const _bert_model, wordpiece, tokenizer = pretrain"Bert-uncased_L-12_H-768_A-12"

I am following the CoLA example found here https://github.com/chengchingwen/Transformers.jl/blob/master/example/BERT/cola/train.jl

I am wondering if the train!() function found in the example trains all of the parameters shown in Flux.params(bert_model), or only those found in Flux.params(_bert_model.classifier). This matters because, if only the classifier parameters are modified instead of all BERT model parameters, then I can load a single pretrain"Bert-uncased_L-12_H-768_A-12" into RAM instead of many, and just train a new classifier (_bert_model.classifier) for each BERT classifier I need. This would save a lot of RAM by not loading a new full BERT model for each classifier.
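To make concrete what I mean by training only the classifier (a sketch; loss_fn, batch, and opt are placeholders from my own pipeline, not names from the example):

# Sketch: collect only the classifier's parameters so update! never touches the shared
# BERT body. loss_fn, batch, and opt are placeholders for my own pipeline.
using Flux
using Flux.Optimise: update!

clf = _bert_model.classifier    # the task-specific head
ps  = Flux.params(clf)          # only the classifier parameters
gs  = Flux.gradient(() -> loss_fn(_bert_model, batch), ps)
update!(opt, ps, gs)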

Please let me know if the whole BERT model is trained with the train!() function, or just the classifier parameters.

Thank you,

Jack

Update Compat

Hi,
when installing Transformers, the following main downgrades happen (this is on Julia 1.6; it works, but not with the latest versions):

Flux v0.12.3 ⇒ v0.12.1
CUDA v3.2.1 ⇒ v2.6.3
DataStructures v0.18.9 ⇒ v0.17.20

Any chance to increase compatibility with the latest releases?

cheers!

Transformer not working with NeuralNetDiffEq

I am trying to develop a transformer model for NeuralNetDiffEq.
It seems like there is a dependency issue:
https://stackoverflow.com/questions/59765831/julia-flux-issue-undefvarerror-tracker-not-defined

[ Info: Precompiling Transformers [21ca0261-441d-5938-ace7-c90938fde4d4]
ERROR: LoadError: LoadError: UndefVarError: Tracker not defined
Stacktrace:
 [1] include(::Module, ::String) at .\Base.jl:377
 [2] include(::String) at C:\Users\zzj04\.julia\packages\Transformers\2BZJs\src\Transformers.jl:1
 [3] top-level scope at C:\Users\zzj04\.julia\packages\Transformers\2BZJs\src\Transformers.jl:33
 [4] include(::Module, ::String) at .\Base.jl:377
 [5] top-level scope at none:2
 [6] eval at .\boot.jl:331 [inlined]
 [7] eval(::Expr) at .\client.jl:449
 [8] top-level scope at .\none:3
in expression starting at C:\Users\zzj04\.julia\packages\Transformers\2BZJs\src\fix\batchedmul.jl:3
in expression starting at C:\Users\zzj04\.julia\packages\Transformers\2BZJs\src\Transformers.jl:33
ERROR: Failed to precompile Transformers [21ca0261-441d-5938-ace7-c90938fde4d4] to C:\Users\zzj04\.julia\compiled\v1.4\Transformers\gPQmm_KBlm6.ji.
Stacktrace:
 [1] error(::String) at .\error.jl:33
 [2] compilecache(::Base.PkgId, ::String) at .\loading.jl:1272
 [3] _require(::Base.PkgId) at .\loading.jl:1029
 [4] require(::Base.PkgId) at .\loading.jl:927
 [5] require(::Module, ::Symbol) at .\loading.jl:922

Download fails with pretrain macro on Linux

Thank you for this awesome package; I'm excited to use it. I can't figure out why I'm unable to download any pretrained models. I'm running the following very basic example on Pop! OS 21.XXX.

using Transformers
using Transformers.Basic
using Transformers.Pretrain

ENV["DATADEPS_ALWAYS_ACCEPT"] = true

bert_model, wordpiece, tokenizer = pretrain"bert-uncased_L-12_H-768_A-12"

I keep getting an error related to 'gcode', but I can't figure out what's going on - it seems like it's looking for a cookie and can't find it.
The error is this:

[ Info: retrying...
┌ Warning: gcode not found.
│   rq =
│    HTTP.Messages.Response:
│    """
│    HTTP/1.1 200 OK
│    Content-Type: text/html; charset=utf-8
│    Cache-Control: no-cache, no-store, max-age=0, must-revalidate
│    Pragma: no-cache
│    Expires: Mon, 01 Jan 1990 00:00:00 GMT
│    Date: Sun, 06 Mar 2022 19:57:54 GMT
│    Content-Length: 2219
│    Strict-Transport-Security: max-age=31536000
│    Report-To: {"group":"DriveUntrustedContentHttp","max_age":2592000,"endpoints":[{"url":"https://csp.withgoogle.com/csp/report-to/DriveUntrustedContentHttp/external"}]}
│    Content-Security-Policy: require-trusted-types-for 'script';report-uri /_/DriveUntrustedContentHttp/cspreport, script-src 'report-sample' 'nonce-z+pwOrMQK8lSU6CRZfoQOw' 'unsafe-inline';object-src 'none';base-uri 'self';report-uri /_/DriveUntrustedContentHttp/cspreport;worker-src 'self'
│    Cross-Origin-Opener-Policy: same-origin; report-to="DriveUntrustedContentHttp"
│    Server: ESF
│    X-XSS-Protection: 0
│    X-Frame-Options: SAMEORIGIN
│    X-Content-Type-Options: nosniff
│    Alt-Svc: h3=":443"; ma=2592000,h3-29=":443"; ma=2592000,h3-Q050=":443"; ma=2592000,h3-Q046=":443"; ma=2592000,h3-Q043=":443"; ma=2592000,quic=":443"; ma=2592000; v="46,43"
│    
│    """
└ @ Transformers.Datasets ~/.julia/packages/Transformers/ko7g9/src/datasets/download_utils.jl:67
ERROR: download failed
Stacktrace:
 [1] error(::String) at ./error.jl:33
 [2] download_gdrive(::String, ::String; retry::Bool, retries::Int64) at /home/swojcik/.julia/packages/Transformers/ko7g9/src/datasets/download_utils.jl:73
 [3] download_gdrive at /home/swojcik/.julia/packages/Transformers/ko7g9/src/datasets/download_utils.jl:55 [inlined]
 [4] run_fetch at /home/swojcik/.julia/packages/DataDeps/ooWXe/src/resolution_automatic.jl:99 [inlined]
 [5] download(::DataDeps.DataDep{String,String,typeof(Transformers.Datasets.download_gdrive),typeof(identity)}, ::String; remotepath::String, i_accept_the_terms_of_use::Nothing, skip_checksum::Bool) at /home/swojcik/.julia/packages/DataDeps/ooWXe/src/resolution_automatic.jl:78
 [6] download at /home/swojcik/.julia/packages/DataDeps/ooWXe/src/resolution_automatic.jl:70 [inlined]
 [7] handle_missing at /home/swojcik/.julia/packages/DataDeps/ooWXe/src/resolution_automatic.jl:10 [inlined]
 [8] _resolve(::DataDeps.DataDep{String,String,typeof(Transformers.Datasets.download_gdrive),typeof(identity)}, ::String) at /home/swojcik/.julia/packages/DataDeps/ooWXe/src/resolution.jl:83
 [9] resolve(::DataDeps.DataDep{String,String,typeof(Transformers.Datasets.download_gdrive),typeof(identity)}, ::String, ::String) at /home/swojcik/.julia/packages/DataDeps/ooWXe/src/resolution.jl:29
 [10] resolve(::String, ::String, ::String) at /home/swojcik/.julia/packages/DataDeps/ooWXe/src/resolution.jl:54
 [11] resolve at /home/swojcik/.julia/packages/DataDeps/ooWXe/src/resolution.jl:73 [inlined]
 [12] load_pretrain(::String; kw::Base.Iterators.Pairs{Union{},Union{},Tuple{},NamedTuple{(),Tuple{}}}) at /home/swojcik/.julia/packages/Transformers/ko7g9/src/pretrain/Pretrain.jl:59
 [13] load_pretrain(::String) at /home/swojcik/.julia/packages/Transformers/ko7g9/src/pretrain/Pretrain.jl:57
 [14] top-level scope at REPL[12]:1

Some plan

Here is some of the stuff I'm going to rewrite for the new release:

Feel free to add comments.

Problem precompiling v0.1.14 on Julia 1.7.2

I can add without problems:

(@v1.7) pkg> add Transformers
    Updating registry at `~/.julia/registries/General.toml`
   Resolving package versions...
    Updating `~/.julia/environments/v1.7/Project.toml`
  [21ca0261] + Transformers v0.1.14
    Updating `~/.julia/environments/v1.7/Manifest.toml`
  [a4280ba5] + BytePairEncoding v0.2.0
  [bb354801] + Fetch v0.1.3
  [fbb45041] + Pickle v0.2.10
  [13d12f88] + PrimitiveOneHot v0.1.1
  [5e0ebb24] + Strided v1.2.2
  [21ca0261] + Transformers v0.1.14
  [9d95972d] + TupleTools v1.3.0

but then there is an ArgumentError on precompile:

julia> using Transformers
[ Info: Precompiling Transformers [21ca0261-441d-5938-ace7-c90938fde4d4]
WARNING: Method definition (::Type{Strided.StridedView{T, N, A, F} where F<:Union{typeof(Base.adjoint), typeof(Base.conj), typeof(Base.identity), typeof(Base.transpose)} where A<:(DenseArray{T, N} where N where T) where N where T})(Base.PermutedDimsArrays.PermutedDimsArray{T, N, perm, iperm, AA} where AA<:(AbstractArray{T, N} where N where T) where iperm) where {T, N, perm} in module Strided at /Users/feldt/.julia/packages/Strided/Af7gm/src/stridedview.jl:35 overwritten in module Torch at /Users/feldt/.julia/packages/Pickle/Ro6BR/src/torch/torch_save.jl:37.
  ** incremental compilation may be fatally broken for this module **

ERROR: LoadError: ArgumentError: Unsupported keyword argument 'config'
Stacktrace:
  [1] var"@cuda"(__source__::LineNumberNode, __module__::Module, ex::Vararg{Any})
    @ CUDA ~/.julia/packages/CUDA/fAEDi/src/compiler/execution.jl:47
  [2] include(mod::Module, _path::String)
    @ Base ./Base.jl:418
  [3] include(x::String)
    @ Transformers.HuggingFace ~/.julia/packages/Transformers/V363g/src/huggingface/HuggingFace.jl:1
...

This is on a MacBook so there is no CUDA. Maybe it doesn't make sense to use Transformers on this machine? Any input/advice welcome.

julia> versioninfo()
Julia Version 1.7.2
Commit bf53498635 (2022-02-06 15:21 UTC)
Platform Info:
  OS: macOS (arm64-apple-darwin21.2.0)
  CPU: Apple M1 Max
  WORD_SIZE: 64
  LIBM: libopenlibm
  LLVM: libLLVM-12.0.1 (ORCJIT, cyclone)
Environment:
  JULIA_NUM_THREADS = 8

Is target masking aligned correctly for loss function

In 1-model.jl, logkldivergence gets called with a mask that does not mask out all padding tokens in the target sequence.

For example, suppose I have a target sequence t (with 2 for the start token, 3 for the end token, and 1 for padding), a predicted sequence p, and a target mask m:
t = 2 a b c 3 1 1
m = 1 1 1 1 1 0 0
p = a b c 3 a a a

logkldivergence gets called with t[2:end], m[1:end-1], p[1:end-1], which aligns like this:
t = a b c 3 1 1
m = 1 1 1 1 1 0
p = a b c 3 a a

It would appear that the first predicted character past the end token is penalized, because it is not masked out. But should it be, since it corresponds to no character in the target?
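To make the alignment concrete (a toy illustration of my reading, not the package's code):

# Toy illustration: shifting the mask together with the target would also exclude the
# position right after the end token.
t = [2, :a, :b, :c, 3, 1, 1]   # start = 2, end = 3, pad = 1
m = [1, 1, 1, 1, 1, 0, 0]

t[2:end]     # a b c 3 1 1  (what the loss compares against)
m[1:end-1]   # 1 1 1 1 1 0  (current mask: still counts the slot right after the end token)
m[2:end]     # 1 1 1 1 0 0  (mask shifted like the target would exclude it)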

bert_model.embed generates different vectors for same inputs

Hello, I'm trying to use your BERT implementation for vectorizing strings, but I found that
bert_embedding = sample |> bert_model.embed generates different vectors on each call. Am I using it incorrectly?

I'm just trying to use it in the manner of this package - https://github.com/JuliaText/Embeddings.jl - but with BERT-specific embeddings.

using Transformers
using Transformers.Basic
using Transformers.Pretrain
using Transformers.Datasets
using Transformers.BidirectionalEncoder

using Flux
using Flux: onehotbatch, gradient
import Flux.Optimise: update!
using WordTokenizers


ENV["DATADEPS_ALWAYS_ACCEPT"] = true
const FromScratch = false

#use wordpiece and tokenizer from pretrain
const wordpiece = pretrain"bert-uncased_L-12_H-768_A-12:wordpiece"
const tokenizer = pretrain"bert-uncased_L-12_H-768_A-12:tokenizer"
const vocab = Vocabulary(wordpiece)

#see model.jl
const bert_model = gpu(
  FromScratch ? create_bert() : pretrain"bert-uncased_L-12_H-768_A-12:bert_model"
)

function vectorize(str::String)
    tokens = str |> tokenizer |> wordpiece
    text = ["[CLS]"; tokens; "[SEP]"]
    token_indices = vocab(text)
    segment_indices = [fill(1, length(tokens) + 2);]
    sample = (tok = token_indices, segment = segment_indices)
    bert_embedding = sample |> bert_model.embed
    collect(sum(bert_embedding, dims=2)[:])
end

using Distances
x1 = vectorize("Some test text")
x2 = vectorize("Some test text")
cosine_dist(x1, x2) # !!!! >>0

using LinearAlgebra
LinearAlgebra.norm(x1 .- x2)  # !!!! >>0
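My guess at the cause (not confirmed): the pretrained model still has dropout active, so every forward pass is stochastic. Putting the model into test mode may make the embeddings deterministic:

# Guess, not confirmed: disable dropout before computing embeddings.
using Flux
Flux.testmode!(bert_model)

x1 = vectorize("Some test text")
x2 = vectorize("Some test text")
x1 ≈ x2   # expected to hold now, if dropout was the cause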

Dimension mismatch error when passing past_key_values in HuggingFace gpt2 (attention_mask size not matching)

When I try to pass the precomputed past_key_values to HuggingFace gpt2 model, I am getting the following error:

DimensionMismatch("arrays could not be broadcast to a common size; got a dimension with lengths 20 and 10")

Stacktrace:
  [1] _bcs1
    @ ./broadcast.jl:501 [inlined]
  [2] _bcs(shape::NTuple{4, Base.OneTo{Int64}}, newshape::NTuple{4, Base.OneTo{Int64}})
    @ Base.Broadcast ./broadcast.jl:495
  [3] broadcast_shape
    @ ./broadcast.jl:489 [inlined]
  [4] combine_axes
    @ ./broadcast.jl:484 [inlined]
  [5] instantiate
    @ ./broadcast.jl:266 [inlined]
  [6] materialize
    @ ./broadcast.jl:883 [inlined]
  [7] _compute_attention_scores(query_layer::Array{Float32, 4}, key_layer::Array{Float32, 4}, attention_mask::Array{Float32, 4})
    @ Transformers.HuggingFace ~/.julia/packages/Transformers/UdOEB/src/huggingface/models/bert.jl:127
  [8] _attn(query::Array{Float32, 4}, key::Array{Float32, 4}, value::Array{Float32, 4}, attention_mask::Array{Float32, 4})
    @ Transformers.HuggingFace ~/.julia/packages/Transformers/UdOEB/src/huggingface/models/gpt2.jl:101
  [9] (::Transformers.HuggingFace.HGFGPT2Attention{Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}})(query::Array{Float32, 4}, key::Array{Float32, 4}, value::Array{Float32, 4}, attention_mask::Array{Float32, 4}, #unused#::Val{true}, #unused#::Val{true})
    @ Transformers.HuggingFace ~/.julia/packages/Transformers/UdOEB/src/huggingface/models/gpt2.jl:116
 [10] (::Transformers.HuggingFace.HGFGPT2Attention{Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}})(x::Array{Float32, 3}, past::Tuple{Array{Float32, 4}, Array{Float32, 4}}, attention_mask::Array{Float32, 4}, _output_attentions::Val{true}, _use_cache::Val{true})
    @ Transformers.HuggingFace ~/.julia/packages/Transformers/UdOEB/src/huggingface/models/gpt2.jl:97
 [11] (::Transformers.HuggingFace.HGFGPT2Block{Transformers.HuggingFace.HGFGPT2Attention{Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.HGFGPT2MLP{typeof(NNlib.gelu), Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}})(x::Array{Float32, 3}, past::Tuple{Array{Float32, 4}, Array{Float32, 4}}, attention_mask::Array{Float32, 4}, _output_attentions::Val{true}, _use_cache::Val{true})
    @ Transformers.HuggingFace ~/.julia/packages/Transformers/UdOEB/src/huggingface/models/gpt2.jl:202
 [12] macro expansion
    @ ~/.julia/packages/Transformers/UdOEB/src/huggingface/models/gpt2.jl:380 [inlined]
 [13] (::Transformers.HuggingFace.HGFGPT2Model{12, Transformers.HuggingFace.FakeTHModuleList{12, NTuple{12, Transformers.HuggingFace.HGFGPT2Block{Transformers.HuggingFace.HGFGPT2Attention{Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.HGFGPT2MLP{typeof(NNlib.gelu), Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}}}}, Transformers.HuggingFace.FakeTHEmbedding{Matrix{Float32}}, Transformers.HuggingFace.FakeTHEmbedding{Matrix{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}})(input::Matrix{Int64}, position_ids::Matrix{Int64}, token_type_ids::Matrix{Int64}, past::NTuple{12, Tuple{Array{Float32, 4}, Array{Float32, 4}}}, attention_mask::Nothing, _output_attentions::Val{true}, _output_hidden_states::Val{true}, _use_cache::Val{true})
    @ Transformers.HuggingFace ~/.julia/packages/Transformers/UdOEB/src/huggingface/models/gpt2.jl:298
 [14] (::Transformers.HuggingFace.HGFGPT2LMHeadModel{Transformers.HuggingFace.HGFGPT2Model{12, Transformers.HuggingFace.FakeTHModuleList{12, NTuple{12, Transformers.HuggingFace.HGFGPT2Block{Transformers.HuggingFace.HGFGPT2Attention{Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.HGFGPT2MLP{typeof(NNlib.gelu), Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}}}}, Transformers.HuggingFace.FakeTHEmbedding{Matrix{Float32}}, Transformers.HuggingFace.FakeTHEmbedding{Matrix{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}}, Transformers.HuggingFace.FakeTHLinear{Matrix{Float32}, Nothing}})(input::Matrix{Int64}, position_ids::Matrix{Int64}, token_type_ids::Matrix{Int64}, past::NTuple{12, Tuple{Array{Float32, 4}, Array{Float32, 4}}}, attention_mask::Nothing, _output_attentions::Val{true}, _output_hidden_states::Val{true}, _use_cache::Val{true})
    @ Transformers.HuggingFace ~/.julia/packages/Transformers/UdOEB/src/huggingface/models/gpt2.jl:440
 [15] (::Transformers.HuggingFace.HGFGPT2LMHeadModel{Transformers.HuggingFace.HGFGPT2Model{12, Transformers.HuggingFace.FakeTHModuleList{12, NTuple{12, Transformers.HuggingFace.HGFGPT2Block{Transformers.HuggingFace.HGFGPT2Attention{Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}, Transformers.HuggingFace.HGFGPT2MLP{typeof(NNlib.gelu), Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}, Transformers.HuggingFace.FakeHGFConv1D{Matrix{Float32}, Vector{Float32}}}}}}, Transformers.HuggingFace.FakeTHEmbedding{Matrix{Float32}}, Transformers.HuggingFace.FakeTHEmbedding{Matrix{Float32}}, Transformers.HuggingFace.FakeTHLayerNorm{Vector{Float32}}}, Transformers.HuggingFace.FakeTHLinear{Matrix{Float32}, Nothing}})(input::Matrix{Int64}; position_ids::Matrix{Int64}, token_type_ids::Matrix{Int64}, past_key_values::NTuple{12, Tuple{Array{Float32, 4}, Array{Float32, 4}}}, attention_mask::Nothing, output_attentions::Bool, output_hidden_states::Bool, use_cache::Bool)
    @ Transformers.HuggingFace ~/.julia/packages/Transformers/UdOEB/src/huggingface/models/gpt2.jl:416
 [16] top-level scope
    @ In[23]:1
 [17] eval
    @ ./boot.jl:360 [inlined]
 [18] include_string(mapexpr::typeof(REPL.softscope), mod::Module, code::String, filename::String)
    @ Base ./loading.jl:1094

I'm using Transformers 0.1.8 with Flux 0.11.6 on Julia 1.6.0.
Minimal code to replicate the above error:

using Transformers.HuggingFace
model = hgf"gpt2:lmheadmodel"
tokens = reshape(Array(1:10),(:,1));
outputs = model(tokens; position_ids=nothing, token_type_ids=nothing,
                                    past_key_values=nothing,
                                    attention_mask=nothing,
                                    output_attentions=true,
                                    output_hidden_states=true,
                                    use_cache=true);
output_new = model(tokens; position_ids=nothing, token_type_ids=nothing,
                                    past_key_values=outputs.past_key_values,
                                    attention_mask=nothing,
                                    output_attentions=true,
                                    output_hidden_states=true,
                                    use_cache=true);

I think it's because the past key values are concatenated with the present key values, making the shape [head_features, 2*seq_len, num_heads, batch]. However, the attention_mask that is being applied is with respect to the original shape:

past_key, past_value = past
key = hcat(past_key, key)
value = hcat(past_value, value)
self(query, key, value, attention_mask, _output_attentions, _use_cache)

function _compute_attention_scores(query_layer, key_layer, attention_mask::Union{Nothing, <:AbstractArray})
    attentions_scores = batchedmul(key_layer, query_layer; transA = true)
    attentions_scores = attentions_scores ./ convert(eltype(attentions_scores), sqrt(size(key_layer, 1)))
    !isnothing(attention_mask) &&
        (attentions_scores = attentions_scores .+ attention_mask)
    return attentions_scores
end

Here the attention_mask expects the attention_scores shape to be [seq_len, seq_len, num_heads, batch], but after the concatenation it is [2*seq_len, seq_len, num_heads, batch].
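A toy reproduction of just the broadcast failure, using my understanding of the shapes (not the package's code):

# Toy shapes: scores computed against the concatenated keys have first dimension 2*seq_len,
# while the mask was built for seq_len, so the broadcast fails exactly like above.
seq_len, heads, batch = 10, 12, 1
scores = randn(Float32, 2seq_len, seq_len, heads, batch)   # after hcat(past_key, key)
mask   = zeros(Float32, seq_len, seq_len, 1, 1)            # built from the original length
scores .+ mask                                             # DimensionMismatch: 20 vs 10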

@chengchingwen Can you please take a look at this?
