Intel disaster.

I'm intrigued, what in that LTT video is wrong?
If anything, Linus has the least amount of bias, especially in that video. He just doesn't like the fact that the Snapdragon beat even the Apple chip, and that says a lot. The Snapdragon X Elite is no slowpoke; imagine what it could do in a desktop with a proper GPU and proper support. It will decimate Intel and AMD. As far as I am aware, all 12 of its cores are performance cores, unlike Intel with their E-cores.
 
Just stop.

If what you are saying is true, then the compiled assembly for ARM would be longer than the equivalent x86 assembly. Except that isn't the case.

using the excellent godbolt site
https://godbolt.org

For the following C++ program:
C++:
#include <iostream>

// Sums the integers 0 .. iterations-1 in a loop, then prints their average
// (the sum converted to double and divided by iterations) to std::cout.
int main() {
    long long sum = 0;  // running total, accumulated as long long
    int iterations = 1000000;

    for (int i = 0; i < iterations; ++i) {
        sum += i;
    }

    // Cast to double first so the division is floating-point, not integer.
    double average = static_cast<double>(sum) / iterations;
    std::cout << "Average: " << average << std::endl;

    return 0;
}

Using GCC for both.

ARM assembly (32-bit ARM; note the r0-r3, sp, lr and pc registers):

Code:
main:
        push    {lr}
        movs    r2, #9
        movw    r1, #:lower16:.LANCHOR0
        movt    r1, #:upper16:.LANCHOR0
        sub     sp, sp, #12
        movw    r0, #:lower16:std::cout
        movt    r0, #:upper16:std::cout
        bl      std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, int)
        vldr.64 d0, .L8
        movw    r0, #:lower16:std::cout
        movt    r0, #:upper16:std::cout
        bl      std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
        ldr     r2, [r0]
        mov     r3, r0
        ldr     r2, [r2, #-12]
        add     r2, r2, r0
        ldr     r0, [r2, #124]
        cbz     r0, .L7
        ldrb    r2, [r0, #28]   @ zero_extendqisi2
        cbz     r2, .L3
        ldrb    r1, [r0, #39]   @ zero_extendqisi2
.L4:
        mov     r0, r3
        bl      std::basic_ostream<char, std::char_traits<char> >::put(char)
        bl      std::basic_ostream<char, std::char_traits<char> >::flush()
        movs    r0, #0
        add     sp, sp, #12
        ldr     pc, [sp], #4
.L3:
        strd    r0, r3, [sp]
        bl      std::ctype<char>::_M_widen_init() const
        ldr     r0, [sp]
        movs    r1, #10
        ldr     r2, [r0]
        ldr     r2, [r2, #24]
        blx     r2
        ldr     r3, [sp, #4]
        mov     r1, r0
        b       .L4
.L7:
        bl      std::__throw_bad_cast()
.L8:
        .word   0
        .word   1092519038

x86-64 assembly (note the rax, rdi, rsp and xmm0 registers):
Code:
.LC0:
        .string "Average: "
main:
        mov     edx, 9
        sub     rsp, 24
        mov     esi, OFFSET FLAT:.LC0
        mov     edi, OFFSET FLAT:std::cout
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        movsd   xmm0, QWORD PTR .LC1[rip]
        mov     edi, OFFSET FLAT:std::cout
        call    std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
        mov     rdx, rax
        mov     rax, QWORD PTR [rax]
        mov     rax, QWORD PTR [rax-24]
        mov     rdi, QWORD PTR [rdx+240+rax]
        test    rdi, rdi
        je      .L5
        cmp     BYTE PTR [rdi+56], 0
        je      .L3
        movzx   eax, BYTE PTR [rdi+67]
.L4:
        mov     rdi, rdx
        movsx   esi, al
        call    std::basic_ostream<char, std::char_traits<char> >::put(char)
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >::flush()
        xor     eax, eax
        add     rsp, 24
        ret
.L3:
        mov     QWORD PTR [rsp+8], rdx
        mov     QWORD PTR [rsp], rdi
        call    std::ctype<char>::_M_widen_init() const
        mov     rdi, QWORD PTR [rsp]
        mov     esi, 10
        mov     rax, QWORD PTR [rdi]
        call    [QWORD PTR [rax+48]]
        mov     rdx, QWORD PTR [rsp+8]
        jmp     .L4
main.cold:
.LC1:
        .long   0
        .long   1092519038

ARM assembly is 45 lines
x86 assembly is 43 lines

Which actually means there is basically zero difference in terms of "complexity" of instructions between these instruction sets. (at least for a simple program like this.)


ThePrimeagen did a really high-quality video on this, going over a Hackaday article arguing that x86 needs to die. The point is that both the ARM and x86 instruction sets are actually quite complex nowadays.
Thank you. You are correct.
You are preaching to the choir my man. Most people on this forum don't comprehend much of what you wrote.
Most people never even coded ASM in their lives, let alone the simple program you provided.
I specifically didn't come here today to avoid having to respond here, so again thank you for the insightful response.
 
Thank you. You are correct.
You are preaching to the choir my man. Most people on this forum don't comprehend much of what you wrote.
Most people never even coded ASM in their lives, let alone the simple program you provided.
I specifically didn't come here today to avoid having to respond here, so again thank you for the insightful response.
I am just a software grug. And to be fair, I was also under the impression that the "RISC" instruction set on ARM was very different from the "CISC" one on x86 for a long time, until I watched that ThePrimeagen video where they went through it and showed why it really isn't as simple as "ARM is RISC, therefore it has fewer instructions".

I really find it hard to digest a statement saying that ARM isn't powerful enough when I can run a 13bn parameter LLM model on my laptop with pretty decent performance (10 tokens per second)
 
I am just a software grug. And to be fair, I was also under the impression that the "RISC" instruction set on ARM was very different from the "CISC" one on x86 for a long time, until I watched that ThePrimeagen video where they went through it and showed why it really isn't as simple as "ARM is RISC, therefore it has fewer instructions".

I really find it hard to digest a statement saying that ARM isn't powerful enough when I can run a 13bn parameter LLM model on my laptop with pretty decent performance (10 tokens per second)
The reality is that the majority here are just users, and have never touched anything besides their laptops/desktops bought from Carbonite and Incredible Connection.

Nobody believes me when I say I built a NAS storage unit for the home with an NVIDIA Jetson Nano, and especially not when I say that thing is fast AF and reads/writes files as fast as the network allows. Er, what processor is in there? ARM, of course.
The board was getting long in the tooth and Nvidia wanted too much money for the nice stuff, so I just lobbed Ubuntu 22.04 LTS on it, enabled the usual software, and off it went.
 
The reality is that the majority here are just users, and have never touched anything besides their laptops/desktops bought from Carbonite and Incredible Connection.

Nobody believes me when I say I built a NAS storage unit for the home with an NVIDIA Jetson Nano, and especially not when I say that thing is fast AF and reads/writes files as fast as the network allows. Er, what processor is in there? ARM, of course.
The board was getting long in the tooth and Nvidia wanted too much money for the nice stuff, so I just lobbed Ubuntu 22.04 LTS on it, enabled the usual software, and off it went.
I would say a Jetson Nano would be overkill for a NAS tbh fam. Those things are meant for neural network processing.

But I am actually fine with most people just being end users. And for the vast majority of end users, an ARM based laptop or desktop is fine for the job. Something like a Snapdragon X based Mac mini clone with more ram and storage could easily be cheaper and faster than the Intel/AMD equivalent.
 
Last edited:
Just stop.

If what you are saying is true, then the compiled assembly for ARM would be longer than the equivalent x86 assembly. Except that isn't the case.

using the excellent godbolt site
https://godbolt.org

For the following C++ program:
C++:
#include <iostream>

// Sums the integers 0 .. iterations-1 in a loop, then prints their average
// (the sum converted to double and divided by iterations) to std::cout.
int main() {
    long long sum = 0;  // running total, accumulated as long long
    int iterations = 1000000;

    for (int i = 0; i < iterations; ++i) {
        sum += i;
    }

    // Cast to double first so the division is floating-point, not integer.
    double average = static_cast<double>(sum) / iterations;
    std::cout << "Average: " << average << std::endl;

    return 0;
}

Using GCC for both.

ARM assembly (32-bit ARM; note the r0-r3, sp, lr and pc registers):

Code:
main:
        push    {lr}
        movs    r2, #9
        movw    r1, #:lower16:.LANCHOR0
        movt    r1, #:upper16:.LANCHOR0
        sub     sp, sp, #12
        movw    r0, #:lower16:std::cout
        movt    r0, #:upper16:std::cout
        bl      std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, int)
        vldr.64 d0, .L8
        movw    r0, #:lower16:std::cout
        movt    r0, #:upper16:std::cout
        bl      std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
        ldr     r2, [r0]
        mov     r3, r0
        ldr     r2, [r2, #-12]
        add     r2, r2, r0
        ldr     r0, [r2, #124]
        cbz     r0, .L7
        ldrb    r2, [r0, #28]   @ zero_extendqisi2
        cbz     r2, .L3
        ldrb    r1, [r0, #39]   @ zero_extendqisi2
.L4:
        mov     r0, r3
        bl      std::basic_ostream<char, std::char_traits<char> >::put(char)
        bl      std::basic_ostream<char, std::char_traits<char> >::flush()
        movs    r0, #0
        add     sp, sp, #12
        ldr     pc, [sp], #4
.L3:
        strd    r0, r3, [sp]
        bl      std::ctype<char>::_M_widen_init() const
        ldr     r0, [sp]
        movs    r1, #10
        ldr     r2, [r0]
        ldr     r2, [r2, #24]
        blx     r2
        ldr     r3, [sp, #4]
        mov     r1, r0
        b       .L4
.L7:
        bl      std::__throw_bad_cast()
.L8:
        .word   0
        .word   1092519038

x86-64 assembly (note the rax, rdi, rsp and xmm0 registers):
Code:
.LC0:
        .string "Average: "
main:
        mov     edx, 9
        sub     rsp, 24
        mov     esi, OFFSET FLAT:.LC0
        mov     edi, OFFSET FLAT:std::cout
        call    std::basic_ostream<char, std::char_traits<char> >& std::__ostream_insert<char, std::char_traits<char> >(std::basic_ostream<char, std::char_traits<char> >&, char const*, long)
        movsd   xmm0, QWORD PTR .LC1[rip]
        mov     edi, OFFSET FLAT:std::cout
        call    std::basic_ostream<char, std::char_traits<char> >& std::basic_ostream<char, std::char_traits<char> >::_M_insert<double>(double)
        mov     rdx, rax
        mov     rax, QWORD PTR [rax]
        mov     rax, QWORD PTR [rax-24]
        mov     rdi, QWORD PTR [rdx+240+rax]
        test    rdi, rdi
        je      .L5
        cmp     BYTE PTR [rdi+56], 0
        je      .L3
        movzx   eax, BYTE PTR [rdi+67]
.L4:
        mov     rdi, rdx
        movsx   esi, al
        call    std::basic_ostream<char, std::char_traits<char> >::put(char)
        mov     rdi, rax
        call    std::basic_ostream<char, std::char_traits<char> >::flush()
        xor     eax, eax
        add     rsp, 24
        ret
.L3:
        mov     QWORD PTR [rsp+8], rdx
        mov     QWORD PTR [rsp], rdi
        call    std::ctype<char>::_M_widen_init() const
        mov     rdi, QWORD PTR [rsp]
        mov     esi, 10
        mov     rax, QWORD PTR [rdi]
        call    [QWORD PTR [rax+48]]
        mov     rdx, QWORD PTR [rsp+8]
        jmp     .L4
main.cold:
.LC1:
        .long   0
        .long   1092519038

ARM assembly is 45 lines
x86 assembly is 43 lines

Which actually means there is basically zero difference in terms of "complexity" of instructions between these instruction sets. (at least for a simple program like this.)


ThePrimeagen did a really high-quality video on this, going over a Hackaday article arguing that x86 needs to die. The point is that both the ARM and x86 instruction sets are actually quite complex nowadays.


REP MOVSB 👀
 
Top
Sign up to the MyBroadband newsletter
X