Get the high bits of the product of two overflowing integers. Like this, but more effi

Even simpler <div class="highlight highlight-source-julia notranslate position-rel

Doesn't seem to be correct? <div class="highlight highlight-source-julia notransla

For the performance difference, <div class="snippet-clipboard-content notranslate

I recall writing something similar: <d

Add `mul_hi` about julia HOT 14 OPEN

LilithHafner commented on August 28, 2024 1

Add `mul_hi`

from julia.

Comments (14)

LilithHafner commented on August 28, 2024 2

Even simpler

function mul_hi(a::UInt128, b::UInt128)
    a_lo = a % UInt64
    b_lo = b % UInt64
    a_hi = a >> 64
    b_hi = b >> 64
    m = a_lo * b_hi + a_hi * b_lo
    a_hi * b_hi + (m >> 64)
end

Stefan and my versions are identical and fewer instructions than Oscar's, but their runtime is similar. Pardon the superfluous debug info, I don't know how to remove it.

julia> @code_native debuginfo=:none mul_hi_osc(rand(UInt128), rand(UInt128))
        .text
        .file   "mul_hi_osc"
        .globl  julia_mul_hi_osc_71387          // -- Begin function julia_mul_hi_osc_71387
        .p2align        2
        .type   julia_mul_hi_osc_71387,@function
julia_mul_hi_osc_71387:                 // @julia_mul_hi_osc_71387
; Function Signature: mul_hi_osc(UInt128, UInt128)
// %bb.0:                               // %top
        //DEBUG_VALUE: mul_hi_osc:v <- [DW_OP_LLVM_fragment 64 64] $x3
        //DEBUG_VALUE: mul_hi_osc:v <- [DW_OP_LLVM_fragment 0 64] $x2
        //DEBUG_VALUE: mul_hi_osc:u <- [DW_OP_LLVM_fragment 64 64] $x1
        //DEBUG_VALUE: mul_hi_osc:u <- [DW_OP_LLVM_fragment 0 64] $x0
        stp     x29, x30, [sp, #-16]!           // 16-byte Folded Spill
        mov     x29, sp
        umulh   x9, x2, x0
        umulh   x10, x2, x1
        mul     x11, x2, x1
        adds    x9, x9, x11
        cinc    x10, x10, hs
        umulh   x11, x3, x0
        mul     x12, x3, x0
        and     x9, x9, #0xffffffffffff
        cmn     x9, x12
        cinc    x9, x11, hs
        umulh   x11, x3, x1
        mul     x12, x3, x1
        adds    x10, x10, x12
        cinc    x11, x11, hs
        adds    x9, x10, x9
        cinc    x10, x11, hs
        stp     x9, x10, [x8]
        ldp     x29, x30, [sp], #16             // 16-byte Folded Reload
        ret
.Lfunc_end0:
        .size   julia_mul_hi_osc_71387, .Lfunc_end0-julia_mul_hi_osc_71387
                                        // -- End function
        .type   ".L+Core.UInt128#71389",@object // @"+Core.UInt128#71389"
        .section        .rodata,"a",@progbits
        .p2align        3, 0x0
".L+Core.UInt128#71389":
        .xword  ".L+Core.UInt128#71389.jit"
        .size   ".L+Core.UInt128#71389", 8

.set ".L+Core.UInt128#71389.jit", 281469765451888
        .size   ".L+Core.UInt128#71389.jit", 8
        .section        ".note.GNU-stack","",@progbits

julia> @code_native debuginfo=:none mul_hi_stef(rand(UInt128), rand(UInt128))
        .text
        .file   "mul_hi_stef"
        .globl  julia_mul_hi_stef_71394         // -- Begin function julia_mul_hi_stef_71394
        .p2align        2
        .type   julia_mul_hi_stef_71394,@function
julia_mul_hi_stef_71394:                // @julia_mul_hi_stef_71394
; Function Signature: mul_hi_stef(UInt128, UInt128)
// %bb.0:                               // %top
        //DEBUG_VALUE: mul_hi_stef:b <- [DW_OP_LLVM_fragment 64 64] $x3
        //DEBUG_VALUE: mul_hi_stef:b <- [DW_OP_LLVM_fragment 0 64] $x2
        //DEBUG_VALUE: mul_hi_stef:a <- [DW_OP_LLVM_fragment 64 64] $x1
        //DEBUG_VALUE: mul_hi_stef:a <- [DW_OP_LLVM_fragment 0 64] $x0
        stp     x29, x30, [sp, #-16]!           // 16-byte Folded Spill
        mov     x29, sp
        umulh   x9, x3, x0
        mul     x10, x3, x0
        umulh   x11, x2, x1
        mul     x12, x2, x1
        cmn     x10, x12
        adc     x9, x9, x11
        umulh   x10, x3, x1
        mul     x11, x3, x1
        adds    x9, x9, x11
        cinc    x10, x10, hs
        stp     x9, x10, [x8]
        ldp     x29, x30, [sp], #16             // 16-byte Folded Reload
        ret
.Lfunc_end0:
        .size   julia_mul_hi_stef_71394, .Lfunc_end0-julia_mul_hi_stef_71394
                                        // -- End function
        .type   ".L+Core.UInt128#71396",@object // @"+Core.UInt128#71396"
        .section        .rodata,"a",@progbits
        .p2align        3, 0x0
".L+Core.UInt128#71396":
        .xword  ".L+Core.UInt128#71396.jit"
        .size   ".L+Core.UInt128#71396", 8

.set ".L+Core.UInt128#71396.jit", 281469765451888
        .size   ".L+Core.UInt128#71396.jit", 8
        .section        ".note.GNU-stack","",@progbits

julia> @code_native debuginfo=:none mul_hi_li(rand(UInt128), rand(UInt128))
        .text
        .file   "mul_hi_li"
        .globl  julia_mul_hi_li_71403           // -- Begin function julia_mul_hi_li_71403
        .p2align        2
        .type   julia_mul_hi_li_71403,@function
julia_mul_hi_li_71403:                  // @julia_mul_hi_li_71403
; Function Signature: mul_hi_li(UInt128, UInt128)
// %bb.0:                               // %top
        //DEBUG_VALUE: mul_hi_li:b <- [DW_OP_LLVM_fragment 64 64] $x3
        //DEBUG_VALUE: mul_hi_li:b <- [DW_OP_LLVM_fragment 0 64] $x2
        //DEBUG_VALUE: mul_hi_li:a <- [DW_OP_LLVM_fragment 64 64] $x1
        //DEBUG_VALUE: mul_hi_li:a <- [DW_OP_LLVM_fragment 0 64] $x0
        stp     x29, x30, [sp, #-16]!           // 16-byte Folded Spill
        mov     x29, sp
        umulh   x9, x3, x0
        mul     x10, x3, x0
        umulh   x11, x2, x1
        mul     x12, x2, x1
        cmn     x10, x12
        adc     x9, x9, x11
        umulh   x10, x3, x1
        mul     x11, x3, x1
        adds    x9, x9, x11
        cinc    x10, x10, hs
        stp     x9, x10, [x8]
        ldp     x29, x30, [sp], #16             // 16-byte Folded Reload
        ret
.Lfunc_end0:
        .size   julia_mul_hi_li_71403, .Lfunc_end0-julia_mul_hi_li_71403
                                        // -- End function
        .type   ".L+Core.UInt128#71405",@object // @"+Core.UInt128#71405"
        .section        .rodata,"a",@progbits
        .p2align        3, 0x0
".L+Core.UInt128#71405":
        .xword  ".L+Core.UInt128#71405.jit"
        .size   ".L+Core.UInt128#71405", 8

.set ".L+Core.UInt128#71405.jit", 281469765451888
        .size   ".L+Core.UInt128#71405.jit", 8
        .section        ".note.GNU-stack","",@progbits

from julia.

LilithHafner commented on August 28, 2024 2

You are correct. All of the implementations proposed so far are buggy. Stefan's and my adaptation fail both due to overflow when constructing m and due to ignoring the fact that the lo product might, when added to the discarded low bits of m, carry. I imagine Oscar's fails only from the latter based on its result, but I haven't checked. Here's a better one that should be a wee bit slower but still benchmarks equivalently

function mul_hi_li2(a::UInt128, b::UInt128)
    a_lo = a % UInt64
    b_lo = b % UInt64
    a_hi = a >> 64
    b_hi = b >> 64
    m0 = a_lo * b_hi + widemul(a_lo, b_lo) >> 64
    m, o = Base.add_with_overflow(m0, a_hi * b_lo)
    a_hi * b_hi + (m >> 64) + o * (UInt128(typemax(UInt64))+1)
end

julia> @code_native debuginfo=:none mul_hi_li2(rand(UInt128), rand(UInt128))
        .text
        .file   "mul_hi_li2"
        .globl  julia_mul_hi_li2_17676          // -- Begin function julia_mul_hi_li2_17676
        .p2align        2
        .type   julia_mul_hi_li2_17676,@function
julia_mul_hi_li2_17676:                 // @julia_mul_hi_li2_17676
; Function Signature: mul_hi_li2(UInt128, UInt128)
// %bb.0:                               // %top
        //DEBUG_VALUE: mul_hi_li2:b <- [DW_OP_LLVM_fragment 64 64] $x3
        //DEBUG_VALUE: mul_hi_li2:b <- [DW_OP_LLVM_fragment 0 64] $x2
        //DEBUG_VALUE: mul_hi_li2:a <- [DW_OP_LLVM_fragment 64 64] $x1
        //DEBUG_VALUE: mul_hi_li2:a <- [DW_OP_LLVM_fragment 0 64] $x0
        stp     x29, x30, [sp, #-16]!           // 16-byte Folded Spill
        mov     x29, sp
        umulh   x9, x3, x0
        mul     x10, x3, x0
        umulh   x11, x2, x0
        adds    x10, x11, x10
        cinc    x9, x9, hs
        umulh   x11, x2, x1
        mul     x12, x2, x1
        cmn     x10, x12
        adcs    x9, x9, x11
        umulh   x10, x3, x1
        mul     x11, x3, x1
        cset    x12, hs
        adds    x9, x9, x11
        adc     x10, x12, x10
        stp     x9, x10, [x8]
        ldp     x29, x30, [sp], #16             // 16-byte Folded Reload
        ret
.Lfunc_end0:
        .size   julia_mul_hi_li2_17676, .Lfunc_end0-julia_mul_hi_li2_17676
                                        // -- End function
        .type   ".L+Core.UInt128#17678",@object // @"+Core.UInt128#17678"
        .section        .rodata,"a",@progbits
        .p2align        3, 0x0
".L+Core.UInt128#17678":
        .xword  ".L+Core.UInt128#17678.jit"
        .size   ".L+Core.UInt128#17678", 8

.set ".L+Core.UInt128#17678.jit", 281472741900400
        .size   ".L+Core.UInt128#17678.jit", 8
        .section        ".note.GNU-stack","",@progbits

from julia.

oscardssmith commented on August 28, 2024 1

for machine size ints it works well, but for Int128 etc it doesn't do a great job. @code_native mul_hi(Int128(2), Int128(5)) fills up about 2 screens of code.

from julia.

sumiya11 commented on August 28, 2024 1

Doesn't seem to be correct?

julia> mul_hi_osc(typemax(UInt128), typemax(UInt128))
0xfffffffffffffffffffffffffffffffd

julia> mul_hi_stef(typemax(UInt128), typemax(UInt128))
0xfffffffffffffffefffffffffffffffd

julia> mul_hi_li(typemax(UInt128), typemax(UInt128))
0xfffffffffffffffefffffffffffffffd

julia> function mul_hi_true(a::T, b::T) where {T}
           ((widen(a)*b) >>> (sizeof(a)*8)) % T
       end
mul_hi_true (generic function with 1 method)

julia> mul_hi_true(typemax(UInt128), typemax(UInt128))
0xfffffffffffffffffffffffffffffffe

from julia.

NHDaly commented on August 28, 2024 1

This one is even one instruction shorter, but it seems to benchmark the same 😁
And the code is the simplest, and beautifully, it works for any BitInteger:

using BitIntegers: UInt256

_widemul(x, y) = widemul(x, y)
_widemul(x::UInt128, y::UInt128) = UInt256(x) * UInt256(y)

function mul_hi_3(x::T, y::T) where T <: Base.BitInteger
    xy = _widemul(x, y)  # Int128 -> Int256
    (xy >> 8sizeof(T)) % T
end

julia> @btime mul_hi_li2(UInt128(2), UInt128(3))
  0.792 ns (0 allocations: 0 bytes)
0x00000000000000000000000000000000

julia> @btime mul_hi_3(UInt128(2), UInt128(3))
  0.791 ns (0 allocations: 0 bytes)
0x00000000000000000000000000000000

julia> @code_native debuginfo=:none mul_hi_li2(UInt128(2), UInt128(3))
        .section        __TEXT,__text,regular,pure_instructions
        .build_version macos, 14, 0
        .globl  _julia_mul_hi_li2_847           ; -- Begin function julia_mul_hi_li2_847
        .p2align        2
_julia_mul_hi_li2_847:                  ; @julia_mul_hi_li2_847
; %bb.0:                                ; %top
        umulh   x9, x3, x0
        mul     x10, x3, x0
        umulh   x11, x2, x0
        adds    x10, x11, x10
        cinc    x9, x9, hs
        umulh   x11, x2, x1
        mul     x12, x2, x1
        cmn     x10, x12
        adcs    x9, x9, x11
        cset    w10, hs
        umulh   x11, x3, x1
        mul     x12, x3, x1
        cmp     w10, #0
        cset    x10, ne
        adds    x9, x9, x12
        adc     x10, x10, x11
        stp     x9, x10, [x8]
        ret
                                        ; -- End function
.subsections_via_symbols

julia> @code_native debuginfo=:none mul_hi_3(UInt128(2), UInt128(3))
        .section        __TEXT,__text,regular,pure_instructions
        .build_version macos, 14, 0
        .globl  _julia_mul_hi_3_849             ; -- Begin function julia_mul_hi_3_849
        .p2align        2
_julia_mul_hi_3_849:                    ; @julia_mul_hi_3_849
; %bb.0:                                ; %top
        umulh   x9, x3, x0
        umulh   x10, x2, x0
        mul     x11, x3, x0
        adds    x10, x11, x10
        cinc    x9, x9, hs
        umulh   x11, x2, x1
        mul     x12, x2, x1
        cmn     x12, x10
        cinc    x10, x11, hs
        adds    x9, x9, x10
        cset    w10, hs
        umulh   x11, x3, x1
        mul     x12, x3, x1
        adds    x9, x12, x9
        adc     x10, x11, x10
        stp     x9, x10, [x8]
        ret
                                        ; -- End function
.subsections_via_symbols

I have thought for a while that Julia should introduce an internal-only type for (U)Int128 to widen to, let's call them UInt128Widened and Int128Widened, so that operations that need to widen temporarily don't need to allocate a BigInt.
We don't want to introduce an actual Int256 type to widen to, since then that would need a 512-bit and it never ends, on and on.
But if we have this UInt128Widened, which is clearly documented to be internal-only, only supports a subset of operations, and therefore can't be used in place of a regular int (for example, don't support printing it at the REPL), then I think we could prevent people from trying to use it for anything other than temporarily widening Int128s! And in so doing, we could eliminate many BigInt multiplications.

For example, we just did exactly that in a FixedDecimals PR here: JuliaMath/FixedPointDecimals.jl#93

CC @Drvi

from julia.

oscardssmith commented on August 28, 2024 1

llvm's can perform analysis of integers of size 1 bit to 2^23 bits https://llvm.org/docs/LangRef.html#integer-type. See http://blog.llvm.org/2020/04/the-new-clang-extint-feature-provides.html and https://internals.rust-lang.org/t/expose-llvm-integer-intrinsics-for-arbitrarily-large-integers/18907 for discussion on potential future use in C and Rust respectively.

from julia.

oscardssmith commented on August 28, 2024

Adding good first issue not because it's a good absolutely first issue, but because it would be a good first issue for someone starting their path towards understanding Julia internals.

from julia.

StefanKarpinski commented on August 28, 2024

I would imagine that the above implementation would get optimized to a single instruction by LLVM most of the time, no? At least when I've written that in the past it's been ok. Indeed, on my M2 system mul_hi(123, 456) compiles down to a single smulh instruction.

from julia.

oscardssmith commented on August 28, 2024

For the performance difference,

function my_mul_hi(u::UInt128, v::UInt128)
    local u0::UInt128, v0::UInt128, w0::UInt128
    local u1::UInt128, v1::UInt128, w1::UInt128, w2::UInt128, t::UInt128

    u0 = u & typemax(UInt64); u1 = u >>> 64
    v0 = v & typemax(UInt64); v1 = v >>> 64
    w0 = u0 * v0
    t = u1 * v0 + (w0 >>> 64)
    w2 = t >>> 64
    w1 = u0 * v1 + (t & 0xffffffffffff)
    hi = u1 * v1 + w2 + (w1 >>> 64)
    return hi
end

julia> @btime my_mul_hi(x,y) setup = (x,y)=rand(UInt128, 2)
  2.042 ns (0 allocations: 0 bytes)

julia> @btime mul_hi(x,y) setup = (x,y)=rand(UInt128, 2)
  184.592 ns (12 allocations: 216 bytes)

from julia.

sumiya11 commented on August 28, 2024

I recall writing something similar:

julia/base/multinverses.jl

Line 138 in 4ee1022

function _mul_high(a::T, b::T) where {T<:Union{Signed, Unsigned}}

from julia.

StefanKarpinski commented on August 28, 2024

Wouldn't this be a reasonable implementation for UInt128?

function mul_hi(a::UInt128, b::UInt128)
    a_lo = a % UInt64
    b_lo = b % UInt64
    a_hi = (a >> 64) % UInt64
    b_hi = (b >> 64) % UInt64
    m = widemul(a_lo, b_hi) + widemul(a_hi, b_lo)
    widemul(a_hi, b_hi) + (m >> 64)
end

Can't quite decipher how that compares to yours...

from julia.

StefanKarpinski commented on August 28, 2024

Ah yes, that's very tidy!

from julia.

oscardssmith commented on August 28, 2024

I have thought for a while that Julia should introduce an internal-only type for (U)Int128 to widen to, let's call them UInt128Widened and Int128Widened, so that operations that need to widen temporarily don't need to allocate a BigInt.

If we were going to do this, we should go all the way and integrate LLVM BitIntegers and then it would work for arbitrary sizes.

from julia.

NHDaly commented on August 28, 2024

What does LLVM BitIntegers refer to?

from julia.

Add `mul_hi` about julia HOT 14 OPEN

Comments (14)

Related Issues (20)

Recommend Projects

React

Vue.js

Typescript

TensorFlow

Django

Laravel

D3

Recommend Topics

javascript

web

server

Machine learning

Visualization

Game

Recommend Org

Facebook

Microsoft

Google

Alibaba

D3

Tencent