Bad optimization for contiguous memory access with a redundant early check
> What happens if you move the `b0` check after the let/else and remove the first redundant check and the extra `buf.get`?
Rust code:
/// Returns `true` iff the 8 bytes of `buf` starting at `pos` decode
/// (little-endian) to `0x1234567812345678`; `false` when the range is
/// out of bounds.
///
/// Variant 3: bounds check via `buf.get` + `let else`, then an early
/// exit on the first byte before assembling the full `u64`.
fn u64_fetch3(buf: &[u8], pos: usize) -> bool {
// `get` yields `Some` only when `pos..pos+8` is fully in bounds, and the
// refutable 8-element slice pattern then binds each byte as a `&u8`.
// NOTE(review): `pos + 8` can overflow for `pos` near `usize::MAX`
// (panics in debug; wraps in release, where the inverted range makes
// `get` return `None` — the `cmp rdx, -8` in the listing below is that check).
let Some([b0, b1, b2, b3, b4, b5, b6, b7]) = buf.get(pos..(pos + 8)) else {
return false;
};
// Early reject on the first byte: 0x78 is the least-significant byte
// of the little-endian target constant below.
if *b0 != 0x78 {
return false;
}
let target = u64::from_le_bytes([*b0, *b1, *b2, *b3, *b4, *b5, *b6, *b7]);
target == 0x1234567812345678
}
> What happens if you include the test in the match itself?
Rust code:
/// Same contract as `u64_fetch3`: `true` iff the 8 bytes at `pos`
/// decode (little-endian) to `0x1234567812345678`.
///
/// Variant 4: the first-byte test is folded into the pattern itself.
fn u64_fetch4(buf: &[u8], pos: usize) -> bool {
// `b0 @ 0x78` is an `@`-binding: the match succeeds only when the first
// byte equals 0x78, and on success `b0` is bound to it — so the bounds
// check and the early byte test happen in a single refutable pattern.
let Some([b0 @ 0x78, b1, b2, b3, b4, b5, b6, b7]) = buf.get(pos..(pos + 8)) else {
return false;
};
let target = u64::from_le_bytes([*b0, *b1, *b2, *b3, *b4, *b5, *b6, *b7]);
target == 0x1234567812345678
}
Both of the above Rust snippets generate the same assembly:
u64_fetch3:
cmp rdx, -8
setae al
lea rcx, [rdx + 8]
cmp rcx, rsi
seta cl
or cl, al
jne .LBB4_1
cmp byte ptr [rdi + rdx], 120
jne .LBB4_1
movzx eax, byte ptr [rdi + rdx + 1]
movzx ecx, byte ptr [rdi + rdx + 2]
mov esi, dword ptr [rdi + rdx + 4]
shl rsi, 32
movzx edx, byte ptr [rdi + rdx + 3]
shl edx, 24
shl ecx, 16
shl eax, 8
or eax, ecx
or eax, edx
or rax, rsi
movabs rcx, 1311768465173140992
cmp rax, rcx
sete al
ret
Still five memory accesses.
> You can also give a name to the whole array to avoid repetition when passing it to `from_le_bytes`.
Thank you very much for this suggestion! But I rarely use this feature — could you give me an example of how to write such code?