6 releases
0.1.6 | Nov 25, 2022 |
---|---|
0.1.5 | Jan 22, 2022 |
#656 in Math
243 downloads per month
Used in 2 crates
(via iter_num_tools)
14KB
245 lines
array_bin_ops
An example implementation of Array Element-Wise Binary Operations in Rust.
It aims to produce efficient code where possible while avoiding any memory-safety issues. Current benchmarks show it to be faster than any safe code currently available (using std only).
Example ASM
Given the following Rust code:
/// Adds two 32-element `i64` arrays element by element.
///
/// `lhs` is wrapped in the crate's `Array` newtype so that its `+`
/// operator performs the element-wise addition against `rhs`; the
/// result is handed back as a plain `[i64; 32]`.
pub fn add_i64x32(lhs: [i64; 32], rhs: [i64; 32]) -> [i64; 32] {
    let sum: [i64; 32] = Array(lhs) + rhs;
    sum
}
It compiles to the following asm, which performs 16 i64x2 add operations (`paddq`) in a fully unrolled sequence, so there is no branching.
#-----------------------------------------------------------------------
# add_i64x32 -- element-wise 64-bit add of two 256-byte arrays
#               (compiler output for the Rust function of the same name)
#
# Syntax: Intel operand order (dst, src) with `xmmword ptr` size keywords;
#         comments use `#` as in GAS `.intel_syntax noprefix`.
# In:     rsi = first 256-byte input   (presumably `lhs` -- confirm vs ABI)
#         rdx = second 256-byte input  (presumably `rhs` -- confirm vs ABI)
#         rdi = 256-byte output buffer
# Out:    rax = rdi (pointer to the output buffer)
# Clobb:  xmm0-xmm15, flags
# Stack:  72 bytes reserved; four 16-byte spill slots at
#         [rsp], [rsp + 16], [rsp + 32] and [rsp + 48].
#         The aligned movaps/movdqa accesses need rsp to be 16-byte
#         aligned after the `sub`; that holds assuming rsp % 16 == 8
#         on entry (System V AMD64 convention).
#
# Each paddq adds two packed i64 lanes, so 16 of them cover all 32
# elements. There are no branches.
#-----------------------------------------------------------------------
add_i64x32:
        sub     rsp, 72
        mov     rax, rdi                          # return value = output pointer

        # Load the first input, 16 bytes (two i64) at a time.
        movdqu  xmm1, xmmword ptr [rsi]
        movdqu  xmm3, xmmword ptr [rsi + 16]
        movdqu  xmm5, xmmword ptr [rsi + 32]
        movdqu  xmm7, xmmword ptr [rsi + 48]
        movdqu  xmm15, xmmword ptr [rsi + 64]
        movdqu  xmm8, xmmword ptr [rsi + 80]
        movdqu  xmm9, xmmword ptr [rsi + 96]
        movdqu  xmm10, xmmword ptr [rsi + 112]
        movdqu  xmm14, xmmword ptr [rsi + 128]
        movdqu  xmm13, xmmword ptr [rsi + 144]
        movdqu  xmm12, xmmword ptr [rsi + 160]
        movdqu  xmm11, xmmword ptr [rsi + 176]
        movups  xmm0, xmmword ptr [rsi + 192]
        movaps  xmmword ptr [rsp], xmm0           # spill input chunk at +192
        movdqu  xmm2, xmmword ptr [rsi + 208]
        movups  xmm0, xmmword ptr [rsi + 224]
        movaps  xmmword ptr [rsp + 48], xmm0      # spill input chunk at +224

        # Load the second input chunk by chunk and add the matching
        # chunk of the first input into it.
        movdqu  xmm0, xmmword ptr [rdx]
        paddq   xmm0, xmm1
        movdqa  xmmword ptr [rsp + 32], xmm0      # spill sum for offset 0
        movdqu  xmm0, xmmword ptr [rdx + 16]
        paddq   xmm0, xmm3
        movdqa  xmmword ptr [rsp + 16], xmm0      # spill sum for offset 16
        movdqu  xmm4, xmmword ptr [rdx + 32]
        paddq   xmm4, xmm5
        movdqu  xmm6, xmmword ptr [rdx + 48]
        paddq   xmm6, xmm7
        movdqu  xmm1, xmmword ptr [rdx + 64]
        paddq   xmm1, xmm15
        movdqu  xmm15, xmmword ptr [rdx + 80]
        paddq   xmm15, xmm8
        movdqu  xmm8, xmmword ptr [rdx + 96]
        paddq   xmm8, xmm9
        movdqu  xmm9, xmmword ptr [rdx + 112]
        paddq   xmm9, xmm10
        movdqu  xmm10, xmmword ptr [rdx + 128]
        paddq   xmm10, xmm14
        movdqu  xmm14, xmmword ptr [rdx + 144]
        paddq   xmm14, xmm13
        movdqu  xmm13, xmmword ptr [rdx + 160]
        paddq   xmm13, xmm12
        movdqu  xmm12, xmmword ptr [rdx + 176]
        paddq   xmm12, xmm11
        movdqu  xmm3, xmmword ptr [rdx + 192]
        paddq   xmm3, xmmword ptr [rsp]           # add spilled chunk at +192
        movdqu  xmm7, xmmword ptr [rdx + 208]
        paddq   xmm7, xmm2
        movdqu  xmm5, xmmword ptr [rdx + 224]
        paddq   xmm5, xmmword ptr [rsp + 48]      # add spilled chunk at +224
        movdqu  xmm11, xmmword ptr [rsi + 240]    # last chunk of first input
        movdqu  xmm0, xmmword ptr [rdx + 240]
        paddq   xmm0, xmm11

        # Store the 16 sums to the output buffer in ascending order.
        movaps  xmm2, xmmword ptr [rsp + 32]      # reload sum for offset 0
        movups  xmmword ptr [rdi], xmm2
        movaps  xmm2, xmmword ptr [rsp + 16]      # reload sum for offset 16
        movups  xmmword ptr [rdi + 16], xmm2
        movdqu  xmmword ptr [rdi + 32], xmm4
        movdqu  xmmword ptr [rdi + 48], xmm6
        movdqu  xmmword ptr [rdi + 64], xmm1
        movdqu  xmmword ptr [rdi + 80], xmm15
        movdqu  xmmword ptr [rdi + 96], xmm8
        movdqu  xmmword ptr [rdi + 112], xmm9
        movdqu  xmmword ptr [rdi + 128], xmm10
        movdqu  xmmword ptr [rdi + 144], xmm14
        movdqu  xmmword ptr [rdi + 160], xmm13
        movdqu  xmmword ptr [rdi + 176], xmm12
        movdqu  xmmword ptr [rdi + 192], xmm3
        movdqu  xmmword ptr [rdi + 208], xmm7
        movdqu  xmmword ptr [rdi + 224], xmm5
        movdqu  xmmword ptr [rdi + 240], xmm0

        add     rsp, 72
        ret