# fma.s — GCC 9.4.0 -S output for fma.c (x86-64, System V AMD64 ABI, AT&T syntax).
#
# Dot-product micro-benchmark over N = 131072 floats, four kernels:
#   naive          — scalar loop, one accumulator
#   unroll         — 8-way scalar unroll, 8 independent accumulators
#   vector         — one 8-float AVX accumulator using vfmadd231ps
#   unroll_vector  — 8 AVX FMA accumulators, 64 floats per iteration
#
# NOTE(review): the file had been whitespace-mangled onto a handful of giant
# lines (GAS needs one statement per line or ';' separators), so it could not
# assemble.  The fix restores conventional one-statement-per-line formatting;
# the instruction stream itself is unchanged compiler output (-O0 style:
# everything round-trips through the stack frame).
# NOTE(review): no vzeroupper is emitted between 256-bit ymm use and the
# printf/ret paths — confirm against the original compiler output.
	.file	"fma.c"
	.text

#-----------------------------------------------------------------------
# double get_time(void)
# Returns gettimeofday() as seconds: tv_sec + tv_usec * 1e-6 (.LC0 = 1e-6).
# Out:   xmm0.  Uses a stack-protector canary at -8(%rbp).
#-----------------------------------------------------------------------
	.globl	get_time
	.type	get_time, @function
get_time:
.LFB4038:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	subq	$32, %rsp
	movq	%fs:40, %rax            # load stack-protector canary
	movq	%rax, -8(%rbp)
	xorl	%eax, %eax
	leaq	-32(%rbp), %rax         # struct timeval on the stack
	movl	$0, %esi                # tz = NULL
	movq	%rax, %rdi
	call	gettimeofday@PLT
	movq	-32(%rbp), %rax         # tv_sec
	vcvtsi2sdq	%rax, %xmm1, %xmm1
	movq	-24(%rbp), %rax         # tv_usec
	vcvtsi2sdq	%rax, %xmm2, %xmm2
	vmovsd	.LC0(%rip), %xmm0       # 1e-6
	vmulsd	%xmm0, %xmm2, %xmm0
	vaddsd	%xmm0, %xmm1, %xmm0     # sec + usec * 1e-6
	movq	-8(%rbp), %rax
	xorq	%fs:40, %rax            # canary check
	je	.L3
	call	__stack_chk_fail@PLT
.L3:
	leave
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE4038:
	.size	get_time, .-get_time

	.section	.rodata
.LC1:
	.string	"Result : %f\n"
.LC2:
	.string	"N : %d\n"
.LC3:
	.string	"Elapsed time : %f sec\n"
.LC6:
	.string	"Throughput : %.5f GFLOPS\n\n"
	.text

#-----------------------------------------------------------------------
# void print_result(float result, double elapsed)
# In:    xmm0 = result, xmm1 = elapsed seconds.
# Prints result, N (hard-coded 131072), elapsed time, and throughput
# (2*131072 / elapsed) / 1e9  (.LC4 = 262144.0, .LC5 = 1e9).
# printf is variadic: %al carries the number of vector-register args.
#-----------------------------------------------------------------------
	.globl	print_result
	.type	print_result, @function
print_result:
.LFB4039:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	subq	$16, %rsp
	vmovss	%xmm0, -4(%rbp)         # spill result
	vmovsd	%xmm1, -16(%rbp)        # spill elapsed
	vcvtss2sd	-4(%rbp), %xmm0, %xmm0  # float promoted to double for %f
	leaq	.LC1(%rip), %rdi
	movl	$1, %eax                # one vector arg
	call	printf@PLT
	movl	$131072, %esi           # N
	leaq	.LC2(%rip), %rdi
	movl	$0, %eax                # no vector args
	call	printf@PLT
	movq	-16(%rbp), %rax
	vmovq	%rax, %xmm0
	leaq	.LC3(%rip), %rdi
	movl	$1, %eax
	call	printf@PLT
	vmovsd	.LC4(%rip), %xmm0       # 262144.0 = 2*N flops
	vdivsd	-16(%rbp), %xmm0, %xmm0 # flops / elapsed
	vmovsd	.LC5(%rip), %xmm1       # 1e9
	vdivsd	%xmm1, %xmm0, %xmm0     # -> GFLOPS
	leaq	.LC6(%rip), %rdi
	movl	$1, %eax
	call	printf@PLT
	nop
	leave
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE4039:
	.size	print_result, .-print_result

#-----------------------------------------------------------------------
# void naive(float *a, float *b)
# In:    rdi = a, rsi = b.  sum += a[i]*b[i] for i in [0, 131071];
# times the loop with get_time() and reports via print_result.
# Locals: -40 = a, -48 = b, -24 = sum, -20 = i, -16 = t0, -8 = t1.
#-----------------------------------------------------------------------
	.globl	naive
	.type	naive, @function
naive:
.LFB4040:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	subq	$48, %rsp
	movq	%rdi, -40(%rbp)
	movq	%rsi, -48(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -24(%rbp)        # sum = 0.0f
	movl	$0, %eax
	call	get_time
	vmovq	%xmm0, %rax
	movq	%rax, -16(%rbp)         # t0
	movl	$0, -20(%rbp)           # i = 0
	jmp	.L6
.L7:
	movl	-20(%rbp), %eax
	leaq	0(,%rax,4), %rdx
	movq	-40(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm1           # a[i]
	movl	-20(%rbp), %eax
	leaq	0(,%rax,4), %rdx
	movq	-48(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm0           # b[i]
	vmulss	%xmm0, %xmm1, %xmm0
	vmovss	-24(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -24(%rbp)        # sum += a[i]*b[i]
	addl	$1, -20(%rbp)
.L6:
	cmpl	$131071, -20(%rbp)
	jbe	.L7
	movl	$0, %eax
	call	get_time
	vmovq	%xmm0, %rax
	movq	%rax, -8(%rbp)          # t1
	vmovsd	-8(%rbp), %xmm0
	vsubsd	-16(%rbp), %xmm0, %xmm0 # elapsed = t1 - t0
	movl	-24(%rbp), %eax
	vmovapd	%xmm0, %xmm1
	vmovd	%eax, %xmm0
	call	print_result
	nop
	leave
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE4040:
	.size	naive, .-naive

#-----------------------------------------------------------------------
# void unroll(float *a, float *b)
# 8-way scalar unroll: 8 independent accumulators at -56..-28(%rbp),
# lane k handles element 8*i+k; outer i in [0, 16383]; lanes summed at end.
# Locals: -72 = a, -80 = b, -24 = i, -20 = total, -16 = t0, -8 = t1.
#-----------------------------------------------------------------------
	.globl	unroll
	.type	unroll, @function
unroll:
.LFB4041:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	subq	$80, %rsp
	movq	%rdi, -72(%rbp)
	movq	%rsi, -80(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0     # zero all 8 accumulators in turn
	vmovss	%xmm0, -28(%rbp)
	vmovss	-28(%rbp), %xmm0
	vmovss	%xmm0, -32(%rbp)
	vmovss	-32(%rbp), %xmm0
	vmovss	%xmm0, -36(%rbp)
	vmovss	-36(%rbp), %xmm0
	vmovss	%xmm0, -40(%rbp)
	vmovss	-40(%rbp), %xmm0
	vmovss	%xmm0, -44(%rbp)
	vmovss	-44(%rbp), %xmm0
	vmovss	%xmm0, -48(%rbp)
	vmovss	-48(%rbp), %xmm0
	vmovss	%xmm0, -52(%rbp)
	vmovss	-52(%rbp), %xmm0
	vmovss	%xmm0, -56(%rbp)
	movl	$0, %eax
	call	get_time
	vmovq	%xmm0, %rax
	movq	%rax, -16(%rbp)         # t0
	movl	$0, -24(%rbp)           # i = 0
	jmp	.L9
.L10:
	# lane 0: acc0 += a[8i] * b[8i]
	movl	-24(%rbp), %eax
	sall	$3, %eax                # 8*i
	movl	%eax, %eax              # zero-extend to 64-bit
	leaq	0(,%rax,4), %rdx
	movq	-72(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm1
	movl	-24(%rbp), %eax
	sall	$3, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-80(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm0
	vmulss	%xmm0, %xmm1, %xmm0
	vmovss	-56(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -56(%rbp)
	# lane 1: acc1 += a[8i+1] * b[8i+1]
	movl	-24(%rbp), %eax
	sall	$3, %eax
	addl	$1, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-72(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm1
	movl	-24(%rbp), %eax
	sall	$3, %eax
	addl	$1, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-80(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm0
	vmulss	%xmm0, %xmm1, %xmm0
	vmovss	-52(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -52(%rbp)
	# lane 2: acc2 += a[8i+2] * b[8i+2]
	movl	-24(%rbp), %eax
	sall	$3, %eax
	addl	$2, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-72(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm1
	movl	-24(%rbp), %eax
	sall	$3, %eax
	addl	$2, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-80(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm0
	vmulss	%xmm0, %xmm1, %xmm0
	vmovss	-48(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -48(%rbp)
	# lane 3: acc3 += a[8i+3] * b[8i+3]
	movl	-24(%rbp), %eax
	sall	$3, %eax
	addl	$3, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-72(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm1
	movl	-24(%rbp), %eax
	sall	$3, %eax
	addl	$3, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-80(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm0
	vmulss	%xmm0, %xmm1, %xmm0
	vmovss	-44(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -44(%rbp)
	# lane 4: acc4 += a[8i+4] * b[8i+4]
	movl	-24(%rbp), %eax
	sall	$3, %eax
	addl	$4, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-72(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm1
	movl	-24(%rbp), %eax
	sall	$3, %eax
	addl	$4, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-80(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm0
	vmulss	%xmm0, %xmm1, %xmm0
	vmovss	-40(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -40(%rbp)
	# lane 5: acc5 += a[8i+5] * b[8i+5]
	movl	-24(%rbp), %eax
	sall	$3, %eax
	addl	$5, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-72(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm1
	movl	-24(%rbp), %eax
	sall	$3, %eax
	addl	$5, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-80(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm0
	vmulss	%xmm0, %xmm1, %xmm0
	vmovss	-36(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -36(%rbp)
	# lane 6: acc6 += a[8i+6] * b[8i+6]
	movl	-24(%rbp), %eax
	sall	$3, %eax
	addl	$6, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-72(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm1
	movl	-24(%rbp), %eax
	sall	$3, %eax
	addl	$6, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-80(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm0
	vmulss	%xmm0, %xmm1, %xmm0
	vmovss	-32(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -32(%rbp)
	# lane 7: acc7 += a[8i+7] * b[8i+7]
	movl	-24(%rbp), %eax
	sall	$3, %eax
	addl	$7, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-72(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm1
	movl	-24(%rbp), %eax
	sall	$3, %eax
	addl	$7, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-80(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), %xmm0
	vmulss	%xmm0, %xmm1, %xmm0
	vmovss	-28(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -28(%rbp)
	addl	$1, -24(%rbp)
.L9:
	cmpl	$16383, -24(%rbp)       # 131072 / 8 iterations
	jbe	.L10
	# reduce the 8 lane accumulators to a single total
	vmovss	-56(%rbp), %xmm0
	vaddss	-52(%rbp), %xmm0, %xmm0
	vaddss	-48(%rbp), %xmm0, %xmm0
	vaddss	-44(%rbp), %xmm0, %xmm0
	vaddss	-40(%rbp), %xmm0, %xmm0
	vaddss	-36(%rbp), %xmm0, %xmm0
	vaddss	-32(%rbp), %xmm0, %xmm0
	vmovss	-28(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -20(%rbp)
	movl	$0, %eax
	call	get_time
	vmovq	%xmm0, %rax
	movq	%rax, -8(%rbp)
	vmovsd	-8(%rbp), %xmm0
	vsubsd	-16(%rbp), %xmm0, %xmm0
	movl	-20(%rbp), %eax
	vmovapd	%xmm0, %xmm1
	vmovd	%eax, %xmm0
	call	print_result
	nop
	leave
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE4041:
	.size	unroll, .-unroll

#-----------------------------------------------------------------------
# void vector(float *a, float *b)
# One 8-float ymm accumulator (spilled at -240(%rbp)); 16384 iterations of
# vfmadd231ps over 32-byte-aligned loads (vmovaps — inputs come from
# aligned_alloc(32, ...) in main).  Horizontal lane sum at the end.
# Stack is realigned to 32 via r10 for the ymm spill slots.
#-----------------------------------------------------------------------
	.globl	vector
	.type	vector, @function
vector:
.LFB4042:
	.cfi_startproc
	endbr64
	leaq	8(%rsp), %r10
	.cfi_def_cfa 10, 0
	andq	$-32, %rsp              # 32-byte-align the frame for vmovaps
	pushq	-8(%r10)
	pushq	%rbp
	movq	%rsp, %rbp
	.cfi_escape 0x10,0x6,0x2,0x76,0
	pushq	%r10
	.cfi_escape 0xf,0x3,0x76,0x78,0x6
	subq	$328, %rsp
	movq	%rdi, -328(%rbp)
	movq	%rsi, -336(%rbp)
	movq	%fs:40, %rax            # stack-protector canary
	movq	%rax, -24(%rbp)
	xorl	%eax, %eax
	# build a zero _mm256 accumulator from 8 zeroed scalar slots
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -304(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -300(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -296(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -292(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -288(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -284(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -280(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -276(%rbp)
	vmovss	-304(%rbp), %xmm1
	vmovss	-300(%rbp), %xmm0
	vunpcklps	%xmm1, %xmm0, %xmm2
	vmovss	-296(%rbp), %xmm1
	vmovss	-292(%rbp), %xmm0
	vunpcklps	%xmm1, %xmm0, %xmm1
	vmovss	-288(%rbp), %xmm3
	vmovss	-284(%rbp), %xmm0
	vunpcklps	%xmm3, %xmm0, %xmm3
	vmovss	-280(%rbp), %xmm4
	vmovss	-276(%rbp), %xmm0
	vunpcklps	%xmm4, %xmm0, %xmm0
	vmovlhps	%xmm3, %xmm0, %xmm0
	vmovlhps	%xmm2, %xmm1, %xmm1
	vinsertf128	$0x1, %xmm1, %ymm0, %ymm0
	vmovaps	%ymm0, -240(%rbp)       # acc = {0,...,0}
	movl	$0, %eax
	call	get_time
	vmovq	%xmm0, %rax
	movq	%rax, -272(%rbp)        # t0
	movl	$0, -312(%rbp)          # i = 0
	jmp	.L13
.L17:
	movl	-312(%rbp), %eax
	sall	$3, %eax                # element index = 8*i
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-328(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -248(%rbp)
	movq	-248(%rbp), %rax
	vmovaps	(%rax), %ymm0           # load a[8i..8i+7]
	vmovaps	%ymm0, -208(%rbp)
	movl	-312(%rbp), %eax
	sall	$3, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-336(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -256(%rbp)
	movq	-256(%rbp), %rax
	vmovaps	(%rax), %ymm0           # load b[8i..8i+7]
	vmovaps	%ymm0, -176(%rbp)
	vmovaps	-240(%rbp), %ymm0
	vmovaps	-208(%rbp), %ymm1
	vmovaps	%ymm1, -144(%rbp)
	vmovaps	-176(%rbp), %ymm1
	vmovaps	%ymm1, -112(%rbp)
	vmovaps	%ymm0, -80(%rbp)
	vmovaps	-112(%rbp), %ymm1
	vmovaps	-80(%rbp), %ymm0
	vfmadd231ps	-144(%rbp), %ymm1, %ymm0  # acc += a * b
	nop
	vmovaps	%ymm0, -240(%rbp)
	addl	$1, -312(%rbp)
.L13:
	cmpl	$16383, -312(%rbp)      # 131072 / 8 iterations
	jbe	.L17
	# horizontal sum of the 8 accumulator lanes
	vmovss	-240(%rbp), %xmm1
	vmovss	-236(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-232(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-228(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-224(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-220(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-216(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-212(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -308(%rbp)
	movl	$0, %eax
	call	get_time
	vmovq	%xmm0, %rax
	movq	%rax, -264(%rbp)        # t1
	vmovsd	-264(%rbp), %xmm0
	vsubsd	-272(%rbp), %xmm0, %xmm0
	movl	-308(%rbp), %eax
	vmovapd	%xmm0, %xmm1
	vmovd	%eax, %xmm0
	call	print_result
	nop
	movq	-24(%rbp), %rax
	xorq	%fs:40, %rax            # canary check
	je	.L18
	call	__stack_chk_fail@PLT
.L18:
	addq	$328, %rsp
	popq	%r10
	.cfi_def_cfa 10, 0
	popq	%rbp
	leaq	-8(%r10), %rsp
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE4042:
	.size	vector, .-vector

#-----------------------------------------------------------------------
# void unroll_vector(float *a, float *b)
# 8 ymm FMA accumulators (spilled at -1584, -1552, -1520, -1488, -1456,
# -1424, -1392, -1360); each loop iteration consumes 64 floats
# (8 aligned 8-float loads from each input); 2048 iterations total.
# Per-accumulator lane sums are stored at -1828..-1800, then summed.
#-----------------------------------------------------------------------
	.globl	unroll_vector
	.type	unroll_vector, @function
unroll_vector:
.LFB4043:
	.cfi_startproc
	endbr64
	leaq	8(%rsp), %r10
	.cfi_def_cfa 10, 0
	andq	$-32, %rsp              # 32-byte-align the frame for vmovaps
	pushq	-8(%r10)
	pushq	%rbp
	movq	%rsp, %rbp
	.cfi_escape 0x10,0x6,0x2,0x76,0
	pushq	%r10
	.cfi_escape 0xf,0x3,0x76,0x78,0x6
	subq	$1864, %rsp
	movq	%rdi, -1848(%rbp)
	movq	%rsi, -1856(%rbp)
	movq	%fs:40, %rax            # stack-protector canary
	movq	%rax, -24(%rbp)
	xorl	%eax, %eax
	# build first zero ymm and fan it out to accumulators 1..4
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1760(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1756(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1752(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1748(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1744(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1740(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1736(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1732(%rbp)
	vmovss	-1760(%rbp), %xmm1
	vmovss	-1756(%rbp), %xmm0
	vunpcklps	%xmm1, %xmm0, %xmm2
	vmovss	-1752(%rbp), %xmm1
	vmovss	-1748(%rbp), %xmm0
	vunpcklps	%xmm1, %xmm0, %xmm1
	vmovss	-1744(%rbp), %xmm3
	vmovss	-1740(%rbp), %xmm0
	vunpcklps	%xmm3, %xmm0, %xmm3
	vmovss	-1736(%rbp), %xmm4
	vmovss	-1732(%rbp), %xmm0
	vunpcklps	%xmm4, %xmm0, %xmm0
	vmovlhps	%xmm3, %xmm0, %xmm0
	vmovlhps	%xmm2, %xmm1, %xmm1
	vinsertf128	$0x1, %xmm1, %ymm0, %ymm0
	vmovaps	%ymm0, -1488(%rbp)
	vmovaps	-1488(%rbp), %ymm0
	vmovaps	%ymm0, -1520(%rbp)
	vmovaps	-1520(%rbp), %ymm0
	vmovaps	%ymm0, -1552(%rbp)
	vmovaps	-1552(%rbp), %ymm0
	vmovaps	%ymm0, -1584(%rbp)
	# build second zero ymm and fan it out to accumulators 5..8
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1792(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1788(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1784(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1780(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1776(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1772(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1768(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -1764(%rbp)
	vmovss	-1792(%rbp), %xmm1
	vmovss	-1788(%rbp), %xmm0
	vunpcklps	%xmm1, %xmm0, %xmm2
	vmovss	-1784(%rbp), %xmm1
	vmovss	-1780(%rbp), %xmm0
	vunpcklps	%xmm1, %xmm0, %xmm1
	vmovss	-1776(%rbp), %xmm3
	vmovss	-1772(%rbp), %xmm0
	vunpcklps	%xmm3, %xmm0, %xmm3
	vmovss	-1768(%rbp), %xmm4
	vmovss	-1764(%rbp), %xmm0
	vunpcklps	%xmm4, %xmm0, %xmm0
	vmovlhps	%xmm3, %xmm0, %xmm0
	vmovlhps	%xmm2, %xmm1, %xmm1
	vinsertf128	$0x1, %xmm1, %ymm0, %ymm0
	vmovaps	%ymm0, -1360(%rbp)
	vmovaps	-1360(%rbp), %ymm0
	vmovaps	%ymm0, -1392(%rbp)
	vmovaps	-1392(%rbp), %ymm0
	vmovaps	%ymm0, -1424(%rbp)
	vmovaps	-1424(%rbp), %ymm0
	vmovaps	%ymm0, -1456(%rbp)
	movl	$0, %eax
	call	get_time
	vmovq	%xmm0, %rax
	movq	%rax, -1728(%rbp)       # t0
	movl	$0, -1832(%rbp)         # i = 0
	jmp	.L22
.L47:
	# load the 8 a-vectors and 8 b-vectors for this 64-float chunk
	movl	-1832(%rbp), %eax
	sall	$6, %eax                # base element index = 64*i
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-1848(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1592(%rbp)
	movq	-1592(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -1328(%rbp)      # a chunk 0
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-1856(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1600(%rbp)
	movq	-1600(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -1296(%rbp)      # b chunk 0
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	addq	$8, %rax
	leaq	0(,%rax,4), %rdx
	movq	-1848(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1608(%rbp)
	movq	-1608(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -1264(%rbp)      # a chunk 1
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	addq	$8, %rax
	leaq	0(,%rax,4), %rdx
	movq	-1856(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1616(%rbp)
	movq	-1616(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -1232(%rbp)      # b chunk 1
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	addq	$16, %rax
	leaq	0(,%rax,4), %rdx
	movq	-1848(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1624(%rbp)
	movq	-1624(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -1200(%rbp)      # a chunk 2
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	addq	$16, %rax
	leaq	0(,%rax,4), %rdx
	movq	-1856(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1632(%rbp)
	movq	-1632(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -1168(%rbp)      # b chunk 2
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	addq	$24, %rax
	leaq	0(,%rax,4), %rdx
	movq	-1848(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1640(%rbp)
	movq	-1640(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -1136(%rbp)      # a chunk 3
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	addq	$24, %rax
	leaq	0(,%rax,4), %rdx
	movq	-1856(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1648(%rbp)
	movq	-1648(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -1104(%rbp)      # b chunk 3
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	addq	$32, %rax
	leaq	0(,%rax,4), %rdx
	movq	-1848(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1656(%rbp)
	movq	-1656(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -1072(%rbp)      # a chunk 4
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	addq	$32, %rax
	leaq	0(,%rax,4), %rdx
	movq	-1856(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1664(%rbp)
	movq	-1664(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -1040(%rbp)      # b chunk 4
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	addq	$40, %rax
	leaq	0(,%rax,4), %rdx
	movq	-1848(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1672(%rbp)
	movq	-1672(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -1008(%rbp)      # a chunk 5
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	addq	$40, %rax
	leaq	0(,%rax,4), %rdx
	movq	-1856(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1680(%rbp)
	movq	-1680(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -976(%rbp)       # b chunk 5
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	addq	$48, %rax
	leaq	0(,%rax,4), %rdx
	movq	-1848(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1688(%rbp)
	movq	-1688(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -944(%rbp)       # a chunk 6
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	addq	$48, %rax
	leaq	0(,%rax,4), %rdx
	movq	-1856(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1696(%rbp)
	movq	-1696(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -912(%rbp)       # b chunk 6
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	addq	$56, %rax
	leaq	0(,%rax,4), %rdx
	movq	-1848(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1704(%rbp)
	movq	-1704(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -880(%rbp)       # a chunk 7
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	addq	$56, %rax
	leaq	0(,%rax,4), %rdx
	movq	-1856(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -1712(%rbp)
	movq	-1712(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -848(%rbp)       # b chunk 7
	# acc0 += a0 * b0
	vmovaps	-1584(%rbp), %ymm0
	vmovaps	-1328(%rbp), %ymm1
	vmovaps	%ymm1, -144(%rbp)
	vmovaps	-1296(%rbp), %ymm1
	vmovaps	%ymm1, -112(%rbp)
	vmovaps	%ymm0, -80(%rbp)
	vmovaps	-112(%rbp), %ymm1
	vmovaps	-80(%rbp), %ymm0
	vfmadd231ps	-144(%rbp), %ymm1, %ymm0
	nop
	vmovaps	%ymm0, -1584(%rbp)
	# acc1 += a1 * b1
	vmovaps	-1552(%rbp), %ymm0
	vmovaps	-1264(%rbp), %ymm1
	vmovaps	%ymm1, -240(%rbp)
	vmovaps	-1232(%rbp), %ymm1
	vmovaps	%ymm1, -208(%rbp)
	vmovaps	%ymm0, -176(%rbp)
	vmovaps	-208(%rbp), %ymm1
	vmovaps	-176(%rbp), %ymm0
	vfmadd231ps	-240(%rbp), %ymm1, %ymm0
	nop
	vmovaps	%ymm0, -1552(%rbp)
	# acc2 += a2 * b2
	vmovaps	-1520(%rbp), %ymm0
	vmovaps	-1200(%rbp), %ymm1
	vmovaps	%ymm1, -336(%rbp)
	vmovaps	-1168(%rbp), %ymm1
	vmovaps	%ymm1, -304(%rbp)
	vmovaps	%ymm0, -272(%rbp)
	vmovaps	-304(%rbp), %ymm1
	vmovaps	-272(%rbp), %ymm0
	vfmadd231ps	-336(%rbp), %ymm1, %ymm0
	nop
	vmovaps	%ymm0, -1520(%rbp)
	# acc3 += a3 * b3
	vmovaps	-1488(%rbp), %ymm0
	vmovaps	-1136(%rbp), %ymm1
	vmovaps	%ymm1, -432(%rbp)
	vmovaps	-1104(%rbp), %ymm1
	vmovaps	%ymm1, -400(%rbp)
	vmovaps	%ymm0, -368(%rbp)
	vmovaps	-400(%rbp), %ymm1
	vmovaps	-368(%rbp), %ymm0
	vfmadd231ps	-432(%rbp), %ymm1, %ymm0
	nop
	vmovaps	%ymm0, -1488(%rbp)
	# acc4 += a4 * b4
	vmovaps	-1456(%rbp), %ymm0
	vmovaps	-1072(%rbp), %ymm1
	vmovaps	%ymm1, -528(%rbp)
	vmovaps	-1040(%rbp), %ymm1
	vmovaps	%ymm1, -496(%rbp)
	vmovaps	%ymm0, -464(%rbp)
	vmovaps	-496(%rbp), %ymm1
	vmovaps	-464(%rbp), %ymm0
	vfmadd231ps	-528(%rbp), %ymm1, %ymm0
	nop
	vmovaps	%ymm0, -1456(%rbp)
	# acc5 += a5 * b5
	vmovaps	-1424(%rbp), %ymm0
	vmovaps	-1008(%rbp), %ymm1
	vmovaps	%ymm1, -624(%rbp)
	vmovaps	-976(%rbp), %ymm1
	vmovaps	%ymm1, -592(%rbp)
	vmovaps	%ymm0, -560(%rbp)
	vmovaps	-592(%rbp), %ymm1
	vmovaps	-560(%rbp), %ymm0
	vfmadd231ps	-624(%rbp), %ymm1, %ymm0
	nop
	vmovaps	%ymm0, -1424(%rbp)
	# acc6 += a6 * b6
	vmovaps	-1392(%rbp), %ymm0
	vmovaps	-944(%rbp), %ymm1
	vmovaps	%ymm1, -720(%rbp)
	vmovaps	-912(%rbp), %ymm1
	vmovaps	%ymm1, -688(%rbp)
	vmovaps	%ymm0, -656(%rbp)
	vmovaps	-688(%rbp), %ymm1
	vmovaps	-656(%rbp), %ymm0
	vfmadd231ps	-720(%rbp), %ymm1, %ymm0
	nop
	vmovaps	%ymm0, -1392(%rbp)
	# acc7 += a7 * b7
	vmovaps	-1360(%rbp), %ymm0
	vmovaps	-880(%rbp), %ymm1
	vmovaps	%ymm1, -816(%rbp)
	vmovaps	-848(%rbp), %ymm1
	vmovaps	%ymm1, -784(%rbp)
	vmovaps	%ymm0, -752(%rbp)
	vmovaps	-784(%rbp), %ymm1
	vmovaps	-752(%rbp), %ymm0
	vfmadd231ps	-816(%rbp), %ymm1, %ymm0
	nop
	vmovaps	%ymm0, -1360(%rbp)
	addl	$1, -1832(%rbp)
.L22:
	cmpl	$2047, -1832(%rbp)      # 131072 / 64 iterations
	jbe	.L47
	# horizontal sum of each ymm accumulator into -1828..-1800
	vmovss	-1584(%rbp), %xmm1
	vmovss	-1580(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1576(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1572(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1568(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1564(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1560(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1556(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -1828(%rbp)
	vmovss	-1552(%rbp), %xmm1
	vmovss	-1548(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1544(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1540(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1536(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1532(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1528(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1524(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -1824(%rbp)
	vmovss	-1520(%rbp), %xmm1
	vmovss	-1516(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1512(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1508(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1504(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1500(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1496(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1492(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -1820(%rbp)
	vmovss	-1488(%rbp), %xmm1
	vmovss	-1484(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1480(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1476(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1472(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1468(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1464(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1460(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -1816(%rbp)
	vmovss	-1456(%rbp), %xmm1
	vmovss	-1452(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1448(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1444(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1440(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1436(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1432(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1428(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -1812(%rbp)
	vmovss	-1424(%rbp), %xmm1
	vmovss	-1420(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1416(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1412(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1408(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1404(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1400(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1396(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -1808(%rbp)
	vmovss	-1392(%rbp), %xmm1
	vmovss	-1388(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1384(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1380(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1376(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1372(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1368(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1364(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -1804(%rbp)
	vmovss	-1360(%rbp), %xmm1
	vmovss	-1356(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1352(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1348(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1344(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1340(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1336(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	-1332(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -1800(%rbp)
	# sum the 8 partial sums into the final result at -1796
	vmovss	-1828(%rbp), %xmm0
	vaddss	-1824(%rbp), %xmm0, %xmm0
	vaddss	-1820(%rbp), %xmm0, %xmm0
	vaddss	-1816(%rbp), %xmm0, %xmm0
	vaddss	-1812(%rbp), %xmm0, %xmm0
	vaddss	-1808(%rbp), %xmm0, %xmm0
	vaddss	-1804(%rbp), %xmm0, %xmm0
	vmovss	-1800(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -1796(%rbp)
	movl	$0, %eax
	call	get_time
	vmovq	%xmm0, %rax
	movq	%rax, -1720(%rbp)       # t1
	vmovsd	-1720(%rbp), %xmm0
	vsubsd	-1728(%rbp), %xmm0, %xmm0
	movl	-1796(%rbp), %eax
	vmovapd	%xmm0, %xmm1
	vmovd	%eax, %xmm0
	call	print_result
	nop
	movq	-24(%rbp), %rax
	xorq	%fs:40, %rax            # canary check
	je	.L48
	call	__stack_chk_fail@PLT
.L48:
	addq	$1864, %rsp
	popq	%r10
	.cfi_def_cfa 10, 0
	popq	%rbp
	leaq	-8(%r10), %rsp
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE4043:
	.size	unroll_vector, .-unroll_vector

	.section	.rodata
.LC9:
	.string	"========= Naive ========="
.LC10:
	.string	"========= Unroll ========="
	.align 8
.LC11:
	.string	"========= Vector Instruction(FMA) ========="
	.align 8
.LC12:
	.string	"========= Unroll + Vector Instruction(FMA) ========="
	.text

#-----------------------------------------------------------------------
# int main(void)
# Seeds srand with tv_sec, allocates two 32-byte-aligned buffers of
# 524288 bytes (131072 floats) via aligned_alloc, fills them with
# rand() / 1e9f (.LC8 = 1e9f), then runs the four benchmark kernels.
# NOTE(review): the buffers are never freed — harmless here since the
# process exits immediately, but worth confirming against fma.c.
#-----------------------------------------------------------------------
	.globl	main
	.type	main, @function
main:
.LFB4044:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	subq	$64, %rsp
	movq	%fs:40, %rax            # stack-protector canary
	movq	%rax, -8(%rbp)
	xorl	%eax, %eax
	leaq	-32(%rbp), %rax
	movl	$0, %esi
	movq	%rax, %rdi
	call	gettimeofday@PLT
	movq	-32(%rbp), %rax         # tv_sec as PRNG seed
	movl	%eax, %edi
	call	srand@PLT
	movl	$524288, %esi           # 131072 floats * 4 bytes
	movl	$32, %edi               # 32-byte alignment for vmovaps
	call	aligned_alloc@PLT
	movq	%rax, -48(%rbp)         # a
	movl	$524288, %esi
	movl	$32, %edi
	call	aligned_alloc@PLT
	movq	%rax, -40(%rbp)         # b
	movl	$0, -52(%rbp)           # i = 0
	jmp	.L50
.L51:
	call	rand@PLT
	vcvtsi2ssl	%eax, %xmm0, %xmm0
	movl	-52(%rbp), %eax
	leaq	0(,%rax,4), %rdx
	movq	-48(%rbp), %rax
	addq	%rdx, %rax
	vmovss	.LC8(%rip), %xmm1       # 1e9f
	vdivss	%xmm1, %xmm0, %xmm0
	vmovss	%xmm0, (%rax)           # a[i] = rand() / 1e9f
	call	rand@PLT
	vcvtsi2ssl	%eax, %xmm0, %xmm0
	movl	-52(%rbp), %eax
	leaq	0(,%rax,4), %rdx
	movq	-40(%rbp), %rax
	addq	%rdx, %rax
	vmovss	.LC8(%rip), %xmm1
	vdivss	%xmm1, %xmm0, %xmm0
	vmovss	%xmm0, (%rax)           # b[i] = rand() / 1e9f
	addl	$1, -52(%rbp)
.L50:
	cmpl	$131071, -52(%rbp)
	jbe	.L51
	leaq	.LC9(%rip), %rdi
	call	puts@PLT
	movq	-40(%rbp), %rdx
	movq	-48(%rbp), %rax
	movq	%rdx, %rsi
	movq	%rax, %rdi
	call	naive
	leaq	.LC10(%rip), %rdi
	call	puts@PLT
	movq	-40(%rbp), %rdx
	movq	-48(%rbp), %rax
	movq	%rdx, %rsi
	movq	%rax, %rdi
	call	unroll
	leaq	.LC11(%rip), %rdi
	call	puts@PLT
	movq	-40(%rbp), %rdx
	movq	-48(%rbp), %rax
	movq	%rdx, %rsi
	movq	%rax, %rdi
	call	vector
	leaq	.LC12(%rip), %rdi
	call	puts@PLT
	movq	-40(%rbp), %rdx
	movq	-48(%rbp), %rax
	movq	%rdx, %rsi
	movq	%rax, %rdi
	call	unroll_vector
	movl	$0, %eax                # return 0
	movq	-8(%rbp), %rcx
	xorq	%fs:40, %rcx            # canary check
	je	.L53
	call	__stack_chk_fail@PLT
.L53:
	leave
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE4044:
	.size	main, .-main

	.section	.rodata
	.align 8
.LC0:	# double 1e-6
	.long	2696277389
	.long	1051772663
	.align 8
.LC4:	# double 262144.0 (= 2 * 131072 flops)
	.long	0
	.long	1091567616
	.align 8
.LC5:	# double 1e9
	.long	0
	.long	1104006501
	.align 4
.LC8:	# float 1e9
	.long	1325400064
	.ident	"GCC: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.align 8
	.long	1f - 0f
	.long	4f - 1f
	.long	5
0:
	.string	"GNU"
1:
	.align 8
	.long	0xc0000002
	.long	3f - 2f
2:
	.long	0x3
3:
	.align 8
4: