浮点数上的custom和std fetch_add的差异

This is an attempt at implementing fetch_add on floats without C++20.

void fetch_add(volatile float* x, float y)
{
    bool success = false;
    auto xi = (volatile std::int32_t*)x;
    while(!success)
    {
        std::int32_t tmp = __atomic_load_n(xi, __ATOMIC_RELAXED);
        auto sum = (float&)tmp + y;
        success = __atomic_compare_exchange_n(xi, &tmp, (std::int32_t&)sum, true, __ATOMIC_RELAXED, __ATOMIC_RELAXED);
    }
}

void fetch_add_std(std::atomic<float>& x, float y)
{
    x.fetch_add(y, std::memory_order_relaxed);
}

To my great confusion, when I compare the assembly, they differ.

fetch_add(float volatile*, float):
.L2:
        mov     eax, DWORD PTR [rdi]
        movd    xmm1, eax
        addss   xmm1, xmm0
        movd    edx, xmm1
        lock cmpxchg    DWORD PTR [rdi], edx
        jne     .L2
        ret
fetch_add_std(std::atomic<float>&, float):
        mov     eax, DWORD PTR [rdi]
        movaps  xmm1, xmm0
        movd    xmm0, eax
        mov     DWORD PTR [rsp-4], eax
        addss   xmm0, xmm1
.L9:
        mov     eax, DWORD PTR [rsp-4]
        movd    edx, xmm0
        lock cmpxchg    DWORD PTR [rdi], edx
        je      .L6
        mov     DWORD PTR [rsp-4], eax
        movss   xmm0, DWORD PTR [rsp-4]
        addss   xmm0, xmm1
        jmp     .L9
.L6:
        ret

我几乎没有汇编程序的能力,但是自定义版本对我来说似乎是正确的,这意味着它要么不正确,效率低下,要么就以某种方式破坏了标准库。我不太相信导致我问的第三种情况,自定义版本不正确还是效率低下?

评论