This is an attempt at implementing fetch_add
on floats without C++20.
void fetch_add(volatile float* x, float y)
{
bool success = false;
auto xi = (volatile std::int32_t*)x;
while(!success)
{
std::int32_t tmp = __atomic_load_n(xi, __ATOMIC_RELAXED);
auto sum = (float&)tmp + y;
success = __atomic_compare_exchange_n(xi, &tmp, (std::int32_t&)sum, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
}
}
void fetch_add_std(std::atomic<float>& x, float y)
{
x.fetch_add(y, std::memory_order_relaxed);
}
To my great confusion, when I compare the assembly, they differ.
fetch_add(float volatile*, float):
.L2:
mov eax, DWORD PTR [rdi]
movd xmm1, eax
addss xmm1, xmm0
movd edx, xmm1
lock cmpxchg DWORD PTR [rdi], edx
jne .L2
ret
fetch_add_std(std::atomic<float>&, float):
mov eax, DWORD PTR [rdi]
movaps xmm1, xmm0
movd xmm0, eax
mov DWORD PTR [rsp-4], eax
addss xmm0, xmm1
.L9:
mov eax, DWORD PTR [rsp-4]
movd edx, xmm0
lock cmpxchg DWORD PTR [rdi], edx
je .L6
mov DWORD PTR [rsp-4], eax
movss xmm0, DWORD PTR [rsp-4]
addss xmm0, xmm1
jmp .L9
.L6:
ret
我几乎没有汇编程序的能力,但是自定义版本对我来说似乎是正确的,这意味着它要么不正确,效率低下,要么就以某种方式破坏了标准库。我不太相信导致我问的第三种情况,自定义版本不正确还是效率低下?