Comments (14)
Even simpler
function mul_hi(a::UInt128, b::UInt128)
a_lo = a % UInt64
b_lo = b % UInt64
a_hi = a >> 64
b_hi = b >> 64
m = a_lo * b_hi + a_hi * b_lo
a_hi * b_hi + (m >> 64)
end
Stefan and my versions are identical and fewer instructions than Oscar's, but their runtime is similar. Pardon the superfluous debug info, I don't know how to remove it.
julia> @code_native debuginfo=:none mul_hi_osc(rand(UInt128), rand(UInt128))
.text
.file "mul_hi_osc"
.globl julia_mul_hi_osc_71387 // -- Begin function julia_mul_hi_osc_71387
.p2align 2
.type julia_mul_hi_osc_71387,@function
julia_mul_hi_osc_71387: // @julia_mul_hi_osc_71387
; Function Signature: mul_hi_osc(UInt128, UInt128)
// %bb.0: // %top
//DEBUG_VALUE: mul_hi_osc:v <- [DW_OP_LLVM_fragment 64 64] $x3
//DEBUG_VALUE: mul_hi_osc:v <- [DW_OP_LLVM_fragment 0 64] $x2
//DEBUG_VALUE: mul_hi_osc:u <- [DW_OP_LLVM_fragment 64 64] $x1
//DEBUG_VALUE: mul_hi_osc:u <- [DW_OP_LLVM_fragment 0 64] $x0
stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
mov x29, sp
umulh x9, x2, x0
umulh x10, x2, x1
mul x11, x2, x1
adds x9, x9, x11
cinc x10, x10, hs
umulh x11, x3, x0
mul x12, x3, x0
and x9, x9, #0xffffffffffff
cmn x9, x12
cinc x9, x11, hs
umulh x11, x3, x1
mul x12, x3, x1
adds x10, x10, x12
cinc x11, x11, hs
adds x9, x10, x9
cinc x10, x11, hs
stp x9, x10, [x8]
ldp x29, x30, [sp], #16 // 16-byte Folded Reload
ret
.Lfunc_end0:
.size julia_mul_hi_osc_71387, .Lfunc_end0-julia_mul_hi_osc_71387
// -- End function
.type ".L+Core.UInt128#71389",@object // @"+Core.UInt128#71389"
.section .rodata,"a",@progbits
.p2align 3, 0x0
".L+Core.UInt128#71389":
.xword ".L+Core.UInt128#71389.jit"
.size ".L+Core.UInt128#71389", 8
.set ".L+Core.UInt128#71389.jit", 281469765451888
.size ".L+Core.UInt128#71389.jit", 8
.section ".note.GNU-stack","",@progbits
julia> @code_native debuginfo=:none mul_hi_stef(rand(UInt128), rand(UInt128))
.text
.file "mul_hi_stef"
.globl julia_mul_hi_stef_71394 // -- Begin function julia_mul_hi_stef_71394
.p2align 2
.type julia_mul_hi_stef_71394,@function
julia_mul_hi_stef_71394: // @julia_mul_hi_stef_71394
; Function Signature: mul_hi_stef(UInt128, UInt128)
// %bb.0: // %top
//DEBUG_VALUE: mul_hi_stef:b <- [DW_OP_LLVM_fragment 64 64] $x3
//DEBUG_VALUE: mul_hi_stef:b <- [DW_OP_LLVM_fragment 0 64] $x2
//DEBUG_VALUE: mul_hi_stef:a <- [DW_OP_LLVM_fragment 64 64] $x1
//DEBUG_VALUE: mul_hi_stef:a <- [DW_OP_LLVM_fragment 0 64] $x0
stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
mov x29, sp
umulh x9, x3, x0
mul x10, x3, x0
umulh x11, x2, x1
mul x12, x2, x1
cmn x10, x12
adc x9, x9, x11
umulh x10, x3, x1
mul x11, x3, x1
adds x9, x9, x11
cinc x10, x10, hs
stp x9, x10, [x8]
ldp x29, x30, [sp], #16 // 16-byte Folded Reload
ret
.Lfunc_end0:
.size julia_mul_hi_stef_71394, .Lfunc_end0-julia_mul_hi_stef_71394
// -- End function
.type ".L+Core.UInt128#71396",@object // @"+Core.UInt128#71396"
.section .rodata,"a",@progbits
.p2align 3, 0x0
".L+Core.UInt128#71396":
.xword ".L+Core.UInt128#71396.jit"
.size ".L+Core.UInt128#71396", 8
.set ".L+Core.UInt128#71396.jit", 281469765451888
.size ".L+Core.UInt128#71396.jit", 8
.section ".note.GNU-stack","",@progbits
julia> @code_native debuginfo=:none mul_hi_li(rand(UInt128), rand(UInt128))
.text
.file "mul_hi_li"
.globl julia_mul_hi_li_71403 // -- Begin function julia_mul_hi_li_71403
.p2align 2
.type julia_mul_hi_li_71403,@function
julia_mul_hi_li_71403: // @julia_mul_hi_li_71403
; Function Signature: mul_hi_li(UInt128, UInt128)
// %bb.0: // %top
//DEBUG_VALUE: mul_hi_li:b <- [DW_OP_LLVM_fragment 64 64] $x3
//DEBUG_VALUE: mul_hi_li:b <- [DW_OP_LLVM_fragment 0 64] $x2
//DEBUG_VALUE: mul_hi_li:a <- [DW_OP_LLVM_fragment 64 64] $x1
//DEBUG_VALUE: mul_hi_li:a <- [DW_OP_LLVM_fragment 0 64] $x0
stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
mov x29, sp
umulh x9, x3, x0
mul x10, x3, x0
umulh x11, x2, x1
mul x12, x2, x1
cmn x10, x12
adc x9, x9, x11
umulh x10, x3, x1
mul x11, x3, x1
adds x9, x9, x11
cinc x10, x10, hs
stp x9, x10, [x8]
ldp x29, x30, [sp], #16 // 16-byte Folded Reload
ret
.Lfunc_end0:
.size julia_mul_hi_li_71403, .Lfunc_end0-julia_mul_hi_li_71403
// -- End function
.type ".L+Core.UInt128#71405",@object // @"+Core.UInt128#71405"
.section .rodata,"a",@progbits
.p2align 3, 0x0
".L+Core.UInt128#71405":
.xword ".L+Core.UInt128#71405.jit"
.size ".L+Core.UInt128#71405", 8
.set ".L+Core.UInt128#71405.jit", 281469765451888
.size ".L+Core.UInt128#71405.jit", 8
.section ".note.GNU-stack","",@progbits
from julia.
You are correct. All of the implementations proposed so far are buggy. Stefan's and my adaptation fail both due to overflow when constructing m
and due to ignoring the fact that the lo product might, when added to the discarded low bits of m
, carry. I imagine Oscar's fails only from the latter based on its result, but I haven't checked. Here's a better one that should be a wee bit slower but still benchmarks equivalently
function mul_hi_li2(a::UInt128, b::UInt128)
a_lo = a % UInt64
b_lo = b % UInt64
a_hi = a >> 64
b_hi = b >> 64
m0 = a_lo * b_hi + widemul(a_lo, b_lo) >> 64
m, o = Base.add_with_overflow(m0, a_hi * b_lo)
a_hi * b_hi + (m >> 64) + o * (UInt128(typemax(UInt64))+1)
end
julia> @code_native debuginfo=:none mul_hi_li2(rand(UInt128), rand(UInt128))
.text
.file "mul_hi_li2"
.globl julia_mul_hi_li2_17676 // -- Begin function julia_mul_hi_li2_17676
.p2align 2
.type julia_mul_hi_li2_17676,@function
julia_mul_hi_li2_17676: // @julia_mul_hi_li2_17676
; Function Signature: mul_hi_li2(UInt128, UInt128)
// %bb.0: // %top
//DEBUG_VALUE: mul_hi_li2:b <- [DW_OP_LLVM_fragment 64 64] $x3
//DEBUG_VALUE: mul_hi_li2:b <- [DW_OP_LLVM_fragment 0 64] $x2
//DEBUG_VALUE: mul_hi_li2:a <- [DW_OP_LLVM_fragment 64 64] $x1
//DEBUG_VALUE: mul_hi_li2:a <- [DW_OP_LLVM_fragment 0 64] $x0
stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
mov x29, sp
umulh x9, x3, x0
mul x10, x3, x0
umulh x11, x2, x0
adds x10, x11, x10
cinc x9, x9, hs
umulh x11, x2, x1
mul x12, x2, x1
cmn x10, x12
adcs x9, x9, x11
umulh x10, x3, x1
mul x11, x3, x1
cset x12, hs
adds x9, x9, x11
adc x10, x12, x10
stp x9, x10, [x8]
ldp x29, x30, [sp], #16 // 16-byte Folded Reload
ret
.Lfunc_end0:
.size julia_mul_hi_li2_17676, .Lfunc_end0-julia_mul_hi_li2_17676
// -- End function
.type ".L+Core.UInt128#17678",@object // @"+Core.UInt128#17678"
.section .rodata,"a",@progbits
.p2align 3, 0x0
".L+Core.UInt128#17678":
.xword ".L+Core.UInt128#17678.jit"
.size ".L+Core.UInt128#17678", 8
.set ".L+Core.UInt128#17678.jit", 281472741900400
.size ".L+Core.UInt128#17678.jit", 8
.section ".note.GNU-stack","",@progbits
from julia.
for machine size ints it works well, but for Int128
etc it doesn't do a great job. @code_native mul_hi(Int128(2), Int128(5))
fills up about 2 screens of code.
from julia.
Doesn't seem to be correct?
julia> mul_hi_osc(typemax(UInt128), typemax(UInt128))
0xfffffffffffffffffffffffffffffffd
julia> mul_hi_stef(typemax(UInt128), typemax(UInt128))
0xfffffffffffffffefffffffffffffffd
julia> mul_hi_li(typemax(UInt128), typemax(UInt128))
0xfffffffffffffffefffffffffffffffd
julia> function mul_hi_true(a::T, b::T) where {T}
((widen(a)*b) >>> (sizeof(a)*8)) % T
end
mul_hi_true (generic function with 1 method)
julia> mul_hi_true(typemax(UInt128), typemax(UInt128))
0xfffffffffffffffffffffffffffffffe
from julia.
This one is even one instruction shorter, but it seems to benchmark the same 😁
And the code is the simplest, and beautifully, it works for any BitInteger:
using BitIntegers: UInt256
_widemul(x, y) = widemul(x, y)
_widemul(x::UInt128, y::UInt128) = UInt256(x) * UInt256(y)
function mul_hi_3(x::T, y::T) where T <: Base.BitInteger
xy = _widemul(x, y) # Int128 -> Int256
(xy >> 8sizeof(T)) % T
end
julia> @btime mul_hi_li2(UInt128(2), UInt128(3))
0.792 ns (0 allocations: 0 bytes)
0x00000000000000000000000000000000
julia> @btime mul_hi_3(UInt128(2), UInt128(3))
0.791 ns (0 allocations: 0 bytes)
0x00000000000000000000000000000000
julia> @code_native debuginfo=:none mul_hi_li2(UInt128(2), UInt128(3))
.section __TEXT,__text,regular,pure_instructions
.build_version macos, 14, 0
.globl _julia_mul_hi_li2_847 ; -- Begin function julia_mul_hi_li2_847
.p2align 2
_julia_mul_hi_li2_847: ; @julia_mul_hi_li2_847
; %bb.0: ; %top
umulh x9, x3, x0
mul x10, x3, x0
umulh x11, x2, x0
adds x10, x11, x10
cinc x9, x9, hs
umulh x11, x2, x1
mul x12, x2, x1
cmn x10, x12
adcs x9, x9, x11
cset w10, hs
umulh x11, x3, x1
mul x12, x3, x1
cmp w10, #0
cset x10, ne
adds x9, x9, x12
adc x10, x10, x11
stp x9, x10, [x8]
ret
; -- End function
.subsections_via_symbols
julia> @code_native debuginfo=:none mul_hi_3(UInt128(2), UInt128(3))
.section __TEXT,__text,regular,pure_instructions
.build_version macos, 14, 0
.globl _julia_mul_hi_3_849 ; -- Begin function julia_mul_hi_3_849
.p2align 2
_julia_mul_hi_3_849: ; @julia_mul_hi_3_849
; %bb.0: ; %top
umulh x9, x3, x0
umulh x10, x2, x0
mul x11, x3, x0
adds x10, x11, x10
cinc x9, x9, hs
umulh x11, x2, x1
mul x12, x2, x1
cmn x12, x10
cinc x10, x11, hs
adds x9, x9, x10
cset w10, hs
umulh x11, x3, x1
mul x12, x3, x1
adds x9, x12, x9
adc x10, x11, x10
stp x9, x10, [x8]
ret
; -- End function
.subsections_via_symbols
I have thought for a while that Julia should introduce an internal-only type for (U)Int128 to widen to, let's call them UInt128Widened
and Int128Widened
, so that operations that need to widen temporarily don't need to allocate a BigInt.
We don't want to introduce an actual Int256
type to widen to, since then that would need a 512-bit and it never ends, on and on.
But if we have this UInt128Widened
, which is clearly documented to be internal-only, only supports a subset of operations, and therefore can't be used in place of a regular int (for example, don't support printing it at the REPL), then I think we could prevent people from trying to use it for anything other than temporarily widening Int128s! And in so doing, we could eliminate many BigInt multiplications.
For example, we just did exactly that in a FixedDecimals PR here: JuliaMath/FixedPointDecimals.jl#93
CC @Drvi
from julia.
llvm's can perform analysis of integers of size 1 bit to 2^23 bits https://llvm.org/docs/LangRef.html#integer-type. See http://blog.llvm.org/2020/04/the-new-clang-extint-feature-provides.html and https://internals.rust-lang.org/t/expose-llvm-integer-intrinsics-for-arbitrarily-large-integers/18907 for discussion on potential future use in C and Rust respectively.
from julia.
Adding good first issue not because it's a good absolutely first issue, but because it would be a good first issue for someone starting their path towards understanding Julia internals.
from julia.
I would imagine that the above implementation would get optimized to a single instruction by LLVM most of the time, no? At least when I've written that in the past it's been ok. Indeed, on my M2 system mul_hi(123, 456)
compiles down to a single smulh
instruction.
from julia.
For the performance difference,
function my_mul_hi(u::UInt128, v::UInt128)
local u0::UInt128, v0::UInt128, w0::UInt128
local u1::UInt128, v1::UInt128, w1::UInt128, w2::UInt128, t::UInt128
u0 = u & typemax(UInt64); u1 = u >>> 64
v0 = v & typemax(UInt64); v1 = v >>> 64
w0 = u0 * v0
t = u1 * v0 + (w0 >>> 64)
w2 = t >>> 64
w1 = u0 * v1 + (t & 0xffffffffffff)
hi = u1 * v1 + w2 + (w1 >>> 64)
return hi
end
julia> @btime my_mul_hi(x,y) setup = (x,y)=rand(UInt128, 2)
2.042 ns (0 allocations: 0 bytes)
julia> @btime mul_hi(x,y) setup = (x,y)=rand(UInt128, 2)
184.592 ns (12 allocations: 216 bytes)
from julia.
I recall writing something similar:
Line 138 in 4ee1022
from julia.
Wouldn't this be a reasonable implementation for UInt128
?
function mul_hi(a::UInt128, b::UInt128)
a_lo = a % UInt64
b_lo = b % UInt64
a_hi = (a >> 64) % UInt64
b_hi = (b >> 64) % UInt64
m = widemul(a_lo, b_hi) + widemul(a_hi, b_lo)
widemul(a_hi, b_hi) + (m >> 64)
end
Can't quite decipher how that compares to yours...
from julia.
Ah yes, that's very tidy!
from julia.
I have thought for a while that Julia should introduce an internal-only type for (U)Int128 to widen to, let's call them UInt128Widened and Int128Widened, so that operations that need to widen temporarily don't need to allocate a BigInt.
If we were going to do this, we should go all the way and integrate LLVM BitIntegers and then it would work for arbitrary sizes.
from julia.
What does LLVM BitIntegers
refer to?
from julia.
Related Issues (20)
- Ambiguous behavior of Tuple constructor with mismatched array length HOT 2
- RetentionParameterEstimator hits incorrect assert in emit_masked_bits_compare
- Monorepo Packages via Submodules/Subpackages: Support in Pkg and Julia Compiler HOT 14
- Circular dependency detected for Symbolics.jl on Julia 1.11.0-rc2 HOT 7
- [REPL] Autocompletion of `ModuleName.@macro_name` is broken
- [REPL] Autocompletion of `@benchmark A .= A setup=(A=` is broken
- Deadlock doing I/O on a foreign thread while the main thread is blocked HOT 5
- `Conditional` lattice element is semantically broken in the presence of SSAValues HOT 1
- `Base.summarysize(::Symbol)` returns zero
- Mention accuracy of `Base.summarysize` in docstring
- `__init__` side effects order is not defined, such that any global effects from `__init__` are a data race HOT 3
- Extension cycles are problematic (long `precompile` times, difficult to understand/resolve) HOT 4
- Add tests about layout of Int128
- Defining `sum(f, ::MyArray; dims)` breaks `sum(::MyArray)`
- Choose number of OpenBLAS threads based on process affinity HOT 6
- `Iterators.rest` gives wrong result for `AbstractRange` HOT 4
- `define_editor` should default to `wait=true` HOT 1
- Type instability when using intrinsics as higher order functions HOT 7
- Can't sum partially initialized non-isbits Hermitian or Symmetric matrix with Diagonal or SymTridiagonal HOT 2
- `--project=@script` is broken or NEWS entry is incorrect (Julia 1.11.0-rc3)
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from julia.