# chundoong-lab-ta/SamsungDS22/fma-example/fma.s
# x86-64 assembly (AT&T syntax) generated by GCC 9.4.0 from fma.c
.file "fma.c"
.text
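# get_time: returns the wall-clock time in seconds as a double,
# computed as tv_sec + tv_usec * 1e-6 (.LC0 at the bottom of the file
# is the double 1e-6).  A hedged sketch of the likely C source, since
# fma.c itself is not shown here:
#   double get_time() {
#     struct timeval tv;
#     gettimeofday(&tv, NULL);
#     return tv.tv_sec + tv.tv_usec * 1e-6;
#   }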
.globl get_time
.type get_time, @function
get_time:
.LFB4038:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $32, %rsp
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
leaq -32(%rbp), %rax
movl $0, %esi
movq %rax, %rdi
call gettimeofday@PLT
movq -32(%rbp), %rax
vcvtsi2sdq %rax, %xmm1, %xmm1
movq -24(%rbp), %rax
vcvtsi2sdq %rax, %xmm2, %xmm2
vmovsd .LC0(%rip), %xmm0
vmulsd %xmm0, %xmm2, %xmm0
vaddsd %xmm0, %xmm1, %xmm0
movq -8(%rbp), %rax
xorq %fs:40, %rax
je .L3
call __stack_chk_fail@PLT
.L3:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE4038:
.size get_time, .-get_time
.section .rodata
.LC1:
.string "Result : %f\n"
.LC2:
.string "N : %d\n"
.LC3:
.string "Elapsed time : %f sec\n"
.LC6:
.string "Throughput : %.5f GFLOPS\n\n"
.text
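# print_result(float result, double elapsed): prints the dot-product
# result, the element count N (hard-coded 131072), the elapsed time,
# and the throughput 262144 / elapsed / 1e9 GFLOPS, i.e. 2*N floating-
# point operations (one multiply plus one add per element).  .LC4 and
# .LC5 below decode to 262144.0 and 1e9.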
.globl print_result
.type print_result, @function
print_result:
.LFB4039:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $16, %rsp
vmovss %xmm0, -4(%rbp)
vmovsd %xmm1, -16(%rbp)
vcvtss2sd -4(%rbp), %xmm0, %xmm0
leaq .LC1(%rip), %rdi
movl $1, %eax
call printf@PLT
movl $131072, %esi
leaq .LC2(%rip), %rdi
movl $0, %eax
call printf@PLT
movq -16(%rbp), %rax
vmovq %rax, %xmm0
leaq .LC3(%rip), %rdi
movl $1, %eax
call printf@PLT
vmovsd .LC4(%rip), %xmm0
vdivsd -16(%rbp), %xmm0, %xmm0
vmovsd .LC5(%rip), %xmm1
vdivsd %xmm1, %xmm0, %xmm0
leaq .LC6(%rip), %rdi
movl $1, %eax
call printf@PLT
nop
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE4039:
.size print_result, .-print_result
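# naive(const float *a, const float *b): scalar dot product over
# 131072 elements; every add depends on the previous one, so this is
# one long serial dependency chain.  A hedged sketch of the likely C:
#   float sum = 0.0f;
#   double start = get_time();
#   for (unsigned int i = 0; i < 131072; i++)
#     sum += a[i] * b[i];
#   print_result(sum, get_time() - start);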
.globl naive
.type naive, @function
naive:
.LFB4040:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $48, %rsp
movq %rdi, -40(%rbp)
movq %rsi, -48(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -24(%rbp)
movl $0, %eax
call get_time
vmovq %xmm0, %rax
movq %rax, -16(%rbp)
movl $0, -20(%rbp)
jmp .L6
.L7:
movl -20(%rbp), %eax
leaq 0(,%rax,4), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm1
movl -20(%rbp), %eax
leaq 0(,%rax,4), %rdx
movq -48(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm0
vmulss %xmm0, %xmm1, %xmm0
vmovss -24(%rbp), %xmm1
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -24(%rbp)
addl $1, -20(%rbp)
.L6:
cmpl $131071, -20(%rbp)
jbe .L7
movl $0, %eax
call get_time
vmovq %xmm0, %rax
movq %rax, -8(%rbp)
vmovsd -8(%rbp), %xmm0
vsubsd -16(%rbp), %xmm0, %xmm0
movl -24(%rbp), %eax
vmovapd %xmm0, %xmm1
vmovd %eax, %xmm0
call print_result
nop
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE4040:
.size naive, .-naive
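# unroll(const float *a, const float *b): the same dot product,
# hand-unrolled 8x with eight independent scalar accumulators
# (spilled to -56(%rbp)..-28(%rbp)); 16384 iterations of 8 elements.
# A hedged sketch of the likely C:
#   float s0 = 0, s1 = 0, /* ... */ s7 = 0;
#   for (unsigned int i = 0; i < 131072 / 8; i++) {
#     s0 += a[i*8 + 0] * b[i*8 + 0];
#     s1 += a[i*8 + 1] * b[i*8 + 1];
#     /* ... and likewise for s2..s7 ... */
#   }
#   float sum = s0 + s1 + s2 + s3 + s4 + s5 + s6 + s7;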
.globl unroll
.type unroll, @function
unroll:
.LFB4041:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $80, %rsp
movq %rdi, -72(%rbp)
movq %rsi, -80(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -28(%rbp)
vmovss -28(%rbp), %xmm0
vmovss %xmm0, -32(%rbp)
vmovss -32(%rbp), %xmm0
vmovss %xmm0, -36(%rbp)
vmovss -36(%rbp), %xmm0
vmovss %xmm0, -40(%rbp)
vmovss -40(%rbp), %xmm0
vmovss %xmm0, -44(%rbp)
vmovss -44(%rbp), %xmm0
vmovss %xmm0, -48(%rbp)
vmovss -48(%rbp), %xmm0
vmovss %xmm0, -52(%rbp)
vmovss -52(%rbp), %xmm0
vmovss %xmm0, -56(%rbp)
movl $0, %eax
call get_time
vmovq %xmm0, %rax
movq %rax, -16(%rbp)
movl $0, -24(%rbp)
jmp .L9
.L10:
movl -24(%rbp), %eax
sall $3, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -72(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm1
movl -24(%rbp), %eax
sall $3, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -80(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm0
vmulss %xmm0, %xmm1, %xmm0
vmovss -56(%rbp), %xmm1
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -56(%rbp)
movl -24(%rbp), %eax
sall $3, %eax
addl $1, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -72(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm1
movl -24(%rbp), %eax
sall $3, %eax
addl $1, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -80(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm0
vmulss %xmm0, %xmm1, %xmm0
vmovss -52(%rbp), %xmm1
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -52(%rbp)
movl -24(%rbp), %eax
sall $3, %eax
addl $2, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -72(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm1
movl -24(%rbp), %eax
sall $3, %eax
addl $2, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -80(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm0
vmulss %xmm0, %xmm1, %xmm0
vmovss -48(%rbp), %xmm1
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -48(%rbp)
movl -24(%rbp), %eax
sall $3, %eax
addl $3, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -72(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm1
movl -24(%rbp), %eax
sall $3, %eax
addl $3, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -80(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm0
vmulss %xmm0, %xmm1, %xmm0
vmovss -44(%rbp), %xmm1
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -44(%rbp)
movl -24(%rbp), %eax
sall $3, %eax
addl $4, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -72(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm1
movl -24(%rbp), %eax
sall $3, %eax
addl $4, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -80(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm0
vmulss %xmm0, %xmm1, %xmm0
vmovss -40(%rbp), %xmm1
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -40(%rbp)
movl -24(%rbp), %eax
sall $3, %eax
addl $5, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -72(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm1
movl -24(%rbp), %eax
sall $3, %eax
addl $5, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -80(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm0
vmulss %xmm0, %xmm1, %xmm0
vmovss -36(%rbp), %xmm1
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -36(%rbp)
movl -24(%rbp), %eax
sall $3, %eax
addl $6, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -72(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm1
movl -24(%rbp), %eax
sall $3, %eax
addl $6, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -80(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm0
vmulss %xmm0, %xmm1, %xmm0
vmovss -32(%rbp), %xmm1
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -32(%rbp)
movl -24(%rbp), %eax
sall $3, %eax
addl $7, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -72(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm1
movl -24(%rbp), %eax
sall $3, %eax
addl $7, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -80(%rbp), %rax
addq %rdx, %rax
vmovss (%rax), %xmm0
vmulss %xmm0, %xmm1, %xmm0
vmovss -28(%rbp), %xmm1
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -28(%rbp)
addl $1, -24(%rbp)
.L9:
cmpl $16383, -24(%rbp)
jbe .L10
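# combine the eight scalar partial sums into one result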
vmovss -56(%rbp), %xmm0
vaddss -52(%rbp), %xmm0, %xmm0
vaddss -48(%rbp), %xmm0, %xmm0
vaddss -44(%rbp), %xmm0, %xmm0
vaddss -40(%rbp), %xmm0, %xmm0
vaddss -36(%rbp), %xmm0, %xmm0
vaddss -32(%rbp), %xmm0, %xmm0
vmovss -28(%rbp), %xmm1
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -20(%rbp)
movl $0, %eax
call get_time
vmovq %xmm0, %rax
movq %rax, -8(%rbp)
vmovsd -8(%rbp), %xmm0
vsubsd -16(%rbp), %xmm0, %xmm0
movl -20(%rbp), %eax
vmovapd %xmm0, %xmm1
vmovd %eax, %xmm0
call print_result
nop
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE4041:
.size unroll, .-unroll
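# vector(const float *a, const float *b): AVX2+FMA dot product.  One
# 256-bit ymm accumulator consumes 8 floats per vfmadd231ps
# (acc += a_vec * b_vec); 16384 iterations, then a horizontal sum of
# the 8 lanes.  The aligned vmovaps loads rely on the 32-byte
# alignment from aligned_alloc in main.  A hedged sketch of the
# likely intrinsics source:
#   __m256 acc = _mm256_setzero_ps();
#   for (unsigned int i = 0; i < 131072 / 8; i++)
#     acc = _mm256_fmadd_ps(_mm256_load_ps(&a[i*8]),
#                           _mm256_load_ps(&b[i*8]), acc);
#   /* then sum the 8 lanes of acc into a scalar */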
.globl vector
.type vector, @function
vector:
.LFB4042:
.cfi_startproc
endbr64
leaq 8(%rsp), %r10
.cfi_def_cfa 10, 0
andq $-32, %rsp
pushq -8(%r10)
pushq %rbp
movq %rsp, %rbp
.cfi_escape 0x10,0x6,0x2,0x76,0
pushq %r10
.cfi_escape 0xf,0x3,0x76,0x78,0x6
subq $328, %rsp
movq %rdi, -328(%rbp)
movq %rsi, -336(%rbp)
movq %fs:40, %rax
movq %rax, -24(%rbp)
xorl %eax, %eax
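# unoptimized code builds the zero accumulator lane by lane: eight
# scalar zeros are stored, packed with vunpcklps/vmovlhps/vinsertf128
# into one ymm value, and kept at -240(%rbp)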
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -304(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -300(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -296(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -292(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -288(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -284(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -280(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -276(%rbp)
vmovss -304(%rbp), %xmm1
vmovss -300(%rbp), %xmm0
vunpcklps %xmm1, %xmm0, %xmm2
vmovss -296(%rbp), %xmm1
vmovss -292(%rbp), %xmm0
vunpcklps %xmm1, %xmm0, %xmm1
vmovss -288(%rbp), %xmm3
vmovss -284(%rbp), %xmm0
vunpcklps %xmm3, %xmm0, %xmm3
vmovss -280(%rbp), %xmm4
vmovss -276(%rbp), %xmm0
vunpcklps %xmm4, %xmm0, %xmm0
vmovlhps %xmm3, %xmm0, %xmm0
vmovlhps %xmm2, %xmm1, %xmm1
vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
vmovaps %ymm0, -240(%rbp)
movl $0, %eax
call get_time
vmovq %xmm0, %rax
movq %rax, -272(%rbp)
movl $0, -312(%rbp)
jmp .L13
.L17:
movl -312(%rbp), %eax
sall $3, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -328(%rbp), %rax
addq %rdx, %rax
movq %rax, -248(%rbp)
movq -248(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -208(%rbp)
movl -312(%rbp), %eax
sall $3, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -336(%rbp), %rax
addq %rdx, %rax
movq %rax, -256(%rbp)
movq -256(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -176(%rbp)
vmovaps -240(%rbp), %ymm0
vmovaps -208(%rbp), %ymm1
vmovaps %ymm1, -144(%rbp)
vmovaps -176(%rbp), %ymm1
vmovaps %ymm1, -112(%rbp)
vmovaps %ymm0, -80(%rbp)
vmovaps -112(%rbp), %ymm1
vmovaps -80(%rbp), %ymm0
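# acc += a_vec * b_vec: one fused multiply-add over 8 floats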
vfmadd231ps -144(%rbp), %ymm1, %ymm0
nop
vmovaps %ymm0, -240(%rbp)
addl $1, -312(%rbp)
.L13:
cmpl $16383, -312(%rbp)
jbe .L17
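# horizontal sum: add the 8 float lanes of the accumulator one by one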
vmovss -240(%rbp), %xmm1
vmovss -236(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -232(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -228(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -224(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -220(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -216(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -212(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -308(%rbp)
movl $0, %eax
call get_time
vmovq %xmm0, %rax
movq %rax, -264(%rbp)
vmovsd -264(%rbp), %xmm0
vsubsd -272(%rbp), %xmm0, %xmm0
movl -308(%rbp), %eax
vmovapd %xmm0, %xmm1
vmovd %eax, %xmm0
call print_result
nop
movq -24(%rbp), %rax
xorq %fs:40, %rax
je .L18
call __stack_chk_fail@PLT
.L18:
addq $328, %rsp
popq %r10
.cfi_def_cfa 10, 0
popq %rbp
leaq -8(%r10), %rsp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE4042:
.size vector, .-vector
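# unroll_vector(const float *a, const float *b): combines unrolling
# and vectorization.  Eight independent ymm accumulators, each updated
# by its own vfmadd231ps, consume 64 floats per iteration; 2048
# iterations, then each accumulator is reduced horizontally and the
# eight partial sums are added.  A hedged sketch of the likely
# intrinsics source (the real fma.c is presumably hand-unrolled; a
# loop over k is shown here for brevity):
#   __m256 acc[8];   /* all set to _mm256_setzero_ps() */
#   for (unsigned int i = 0; i < 131072 / 64; i++)
#     for (int k = 0; k < 8; k++)
#       acc[k] = _mm256_fmadd_ps(_mm256_load_ps(&a[i*64 + k*8]),
#                                _mm256_load_ps(&b[i*64 + k*8]),
#                                acc[k]);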
.globl unroll_vector
.type unroll_vector, @function
unroll_vector:
.LFB4043:
.cfi_startproc
endbr64
leaq 8(%rsp), %r10
.cfi_def_cfa 10, 0
andq $-32, %rsp
pushq -8(%r10)
pushq %rbp
movq %rsp, %rbp
.cfi_escape 0x10,0x6,0x2,0x76,0
pushq %r10
.cfi_escape 0xf,0x3,0x76,0x78,0x6
subq $1864, %rsp
movq %rdi, -1848(%rbp)
movq %rsi, -1856(%rbp)
movq %fs:40, %rax
movq %rax, -24(%rbp)
xorl %eax, %eax
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1760(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1756(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1752(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1748(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1744(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1740(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1736(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1732(%rbp)
vmovss -1760(%rbp), %xmm1
vmovss -1756(%rbp), %xmm0
vunpcklps %xmm1, %xmm0, %xmm2
vmovss -1752(%rbp), %xmm1
vmovss -1748(%rbp), %xmm0
vunpcklps %xmm1, %xmm0, %xmm1
vmovss -1744(%rbp), %xmm3
vmovss -1740(%rbp), %xmm0
vunpcklps %xmm3, %xmm0, %xmm3
vmovss -1736(%rbp), %xmm4
vmovss -1732(%rbp), %xmm0
vunpcklps %xmm4, %xmm0, %xmm0
vmovlhps %xmm3, %xmm0, %xmm0
vmovlhps %xmm2, %xmm1, %xmm1
vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
vmovaps %ymm0, -1488(%rbp)
vmovaps -1488(%rbp), %ymm0
vmovaps %ymm0, -1520(%rbp)
vmovaps -1520(%rbp), %ymm0
vmovaps %ymm0, -1552(%rbp)
vmovaps -1552(%rbp), %ymm0
vmovaps %ymm0, -1584(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1792(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1788(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1784(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1780(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1776(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1772(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1768(%rbp)
vxorps %xmm0, %xmm0, %xmm0
vmovss %xmm0, -1764(%rbp)
vmovss -1792(%rbp), %xmm1
vmovss -1788(%rbp), %xmm0
vunpcklps %xmm1, %xmm0, %xmm2
vmovss -1784(%rbp), %xmm1
vmovss -1780(%rbp), %xmm0
vunpcklps %xmm1, %xmm0, %xmm1
vmovss -1776(%rbp), %xmm3
vmovss -1772(%rbp), %xmm0
vunpcklps %xmm3, %xmm0, %xmm3
vmovss -1768(%rbp), %xmm4
vmovss -1764(%rbp), %xmm0
vunpcklps %xmm4, %xmm0, %xmm0
vmovlhps %xmm3, %xmm0, %xmm0
vmovlhps %xmm2, %xmm1, %xmm1
vinsertf128 $0x1, %xmm1, %ymm0, %ymm0
vmovaps %ymm0, -1360(%rbp)
vmovaps -1360(%rbp), %ymm0
vmovaps %ymm0, -1392(%rbp)
vmovaps -1392(%rbp), %ymm0
vmovaps %ymm0, -1424(%rbp)
vmovaps -1424(%rbp), %ymm0
vmovaps %ymm0, -1456(%rbp)
movl $0, %eax
call get_time
vmovq %xmm0, %rax
movq %rax, -1728(%rbp)
movl $0, -1832(%rbp)
jmp .L22
.L47:
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -1848(%rbp), %rax
addq %rdx, %rax
movq %rax, -1592(%rbp)
movq -1592(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -1328(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
leaq 0(,%rax,4), %rdx
movq -1856(%rbp), %rax
addq %rdx, %rax
movq %rax, -1600(%rbp)
movq -1600(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -1296(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
addq $8, %rax
leaq 0(,%rax,4), %rdx
movq -1848(%rbp), %rax
addq %rdx, %rax
movq %rax, -1608(%rbp)
movq -1608(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -1264(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
addq $8, %rax
leaq 0(,%rax,4), %rdx
movq -1856(%rbp), %rax
addq %rdx, %rax
movq %rax, -1616(%rbp)
movq -1616(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -1232(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
addq $16, %rax
leaq 0(,%rax,4), %rdx
movq -1848(%rbp), %rax
addq %rdx, %rax
movq %rax, -1624(%rbp)
movq -1624(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -1200(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
addq $16, %rax
leaq 0(,%rax,4), %rdx
movq -1856(%rbp), %rax
addq %rdx, %rax
movq %rax, -1632(%rbp)
movq -1632(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -1168(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
addq $24, %rax
leaq 0(,%rax,4), %rdx
movq -1848(%rbp), %rax
addq %rdx, %rax
movq %rax, -1640(%rbp)
movq -1640(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -1136(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
addq $24, %rax
leaq 0(,%rax,4), %rdx
movq -1856(%rbp), %rax
addq %rdx, %rax
movq %rax, -1648(%rbp)
movq -1648(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -1104(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
addq $32, %rax
leaq 0(,%rax,4), %rdx
movq -1848(%rbp), %rax
addq %rdx, %rax
movq %rax, -1656(%rbp)
movq -1656(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -1072(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
addq $32, %rax
leaq 0(,%rax,4), %rdx
movq -1856(%rbp), %rax
addq %rdx, %rax
movq %rax, -1664(%rbp)
movq -1664(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -1040(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
addq $40, %rax
leaq 0(,%rax,4), %rdx
movq -1848(%rbp), %rax
addq %rdx, %rax
movq %rax, -1672(%rbp)
movq -1672(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -1008(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
addq $40, %rax
leaq 0(,%rax,4), %rdx
movq -1856(%rbp), %rax
addq %rdx, %rax
movq %rax, -1680(%rbp)
movq -1680(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -976(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
addq $48, %rax
leaq 0(,%rax,4), %rdx
movq -1848(%rbp), %rax
addq %rdx, %rax
movq %rax, -1688(%rbp)
movq -1688(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -944(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
addq $48, %rax
leaq 0(,%rax,4), %rdx
movq -1856(%rbp), %rax
addq %rdx, %rax
movq %rax, -1696(%rbp)
movq -1696(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -912(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
addq $56, %rax
leaq 0(,%rax,4), %rdx
movq -1848(%rbp), %rax
addq %rdx, %rax
movq %rax, -1704(%rbp)
movq -1704(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -880(%rbp)
movl -1832(%rbp), %eax
sall $6, %eax
movl %eax, %eax
addq $56, %rax
leaq 0(,%rax,4), %rdx
movq -1856(%rbp), %rax
addq %rdx, %rax
movq %rax, -1712(%rbp)
movq -1712(%rbp), %rax
vmovaps (%rax), %ymm0
vmovaps %ymm0, -848(%rbp)
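# eight FMAs on independent accumulators follow; the independence is
# meant to let the FMA operations overlap instead of serializing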
vmovaps -1584(%rbp), %ymm0
vmovaps -1328(%rbp), %ymm1
vmovaps %ymm1, -144(%rbp)
vmovaps -1296(%rbp), %ymm1
vmovaps %ymm1, -112(%rbp)
vmovaps %ymm0, -80(%rbp)
vmovaps -112(%rbp), %ymm1
vmovaps -80(%rbp), %ymm0
vfmadd231ps -144(%rbp), %ymm1, %ymm0
nop
vmovaps %ymm0, -1584(%rbp)
vmovaps -1552(%rbp), %ymm0
vmovaps -1264(%rbp), %ymm1
vmovaps %ymm1, -240(%rbp)
vmovaps -1232(%rbp), %ymm1
vmovaps %ymm1, -208(%rbp)
vmovaps %ymm0, -176(%rbp)
vmovaps -208(%rbp), %ymm1
vmovaps -176(%rbp), %ymm0
vfmadd231ps -240(%rbp), %ymm1, %ymm0
nop
vmovaps %ymm0, -1552(%rbp)
vmovaps -1520(%rbp), %ymm0
vmovaps -1200(%rbp), %ymm1
vmovaps %ymm1, -336(%rbp)
vmovaps -1168(%rbp), %ymm1
vmovaps %ymm1, -304(%rbp)
vmovaps %ymm0, -272(%rbp)
vmovaps -304(%rbp), %ymm1
vmovaps -272(%rbp), %ymm0
vfmadd231ps -336(%rbp), %ymm1, %ymm0
nop
vmovaps %ymm0, -1520(%rbp)
vmovaps -1488(%rbp), %ymm0
vmovaps -1136(%rbp), %ymm1
vmovaps %ymm1, -432(%rbp)
vmovaps -1104(%rbp), %ymm1
vmovaps %ymm1, -400(%rbp)
vmovaps %ymm0, -368(%rbp)
vmovaps -400(%rbp), %ymm1
vmovaps -368(%rbp), %ymm0
vfmadd231ps -432(%rbp), %ymm1, %ymm0
nop
vmovaps %ymm0, -1488(%rbp)
vmovaps -1456(%rbp), %ymm0
vmovaps -1072(%rbp), %ymm1
vmovaps %ymm1, -528(%rbp)
vmovaps -1040(%rbp), %ymm1
vmovaps %ymm1, -496(%rbp)
vmovaps %ymm0, -464(%rbp)
vmovaps -496(%rbp), %ymm1
vmovaps -464(%rbp), %ymm0
vfmadd231ps -528(%rbp), %ymm1, %ymm0
nop
vmovaps %ymm0, -1456(%rbp)
vmovaps -1424(%rbp), %ymm0
vmovaps -1008(%rbp), %ymm1
vmovaps %ymm1, -624(%rbp)
vmovaps -976(%rbp), %ymm1
vmovaps %ymm1, -592(%rbp)
vmovaps %ymm0, -560(%rbp)
vmovaps -592(%rbp), %ymm1
vmovaps -560(%rbp), %ymm0
vfmadd231ps -624(%rbp), %ymm1, %ymm0
nop
vmovaps %ymm0, -1424(%rbp)
vmovaps -1392(%rbp), %ymm0
vmovaps -944(%rbp), %ymm1
vmovaps %ymm1, -720(%rbp)
vmovaps -912(%rbp), %ymm1
vmovaps %ymm1, -688(%rbp)
vmovaps %ymm0, -656(%rbp)
vmovaps -688(%rbp), %ymm1
vmovaps -656(%rbp), %ymm0
vfmadd231ps -720(%rbp), %ymm1, %ymm0
nop
vmovaps %ymm0, -1392(%rbp)
vmovaps -1360(%rbp), %ymm0
vmovaps -880(%rbp), %ymm1
vmovaps %ymm1, -816(%rbp)
vmovaps -848(%rbp), %ymm1
vmovaps %ymm1, -784(%rbp)
vmovaps %ymm0, -752(%rbp)
vmovaps -784(%rbp), %ymm1
vmovaps -752(%rbp), %ymm0
vfmadd231ps -816(%rbp), %ymm1, %ymm0
nop
vmovaps %ymm0, -1360(%rbp)
addl $1, -1832(%rbp)
.L22:
cmpl $2047, -1832(%rbp)
jbe .L47
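# reduce each of the eight ymm accumulators to a scalar partial sum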
vmovss -1584(%rbp), %xmm1
vmovss -1580(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1576(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1572(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1568(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1564(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1560(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1556(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -1828(%rbp)
vmovss -1552(%rbp), %xmm1
vmovss -1548(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1544(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1540(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1536(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1532(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1528(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1524(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -1824(%rbp)
vmovss -1520(%rbp), %xmm1
vmovss -1516(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1512(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1508(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1504(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1500(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1496(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1492(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -1820(%rbp)
vmovss -1488(%rbp), %xmm1
vmovss -1484(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1480(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1476(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1472(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1468(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1464(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1460(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -1816(%rbp)
vmovss -1456(%rbp), %xmm1
vmovss -1452(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1448(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1444(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1440(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1436(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1432(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1428(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -1812(%rbp)
vmovss -1424(%rbp), %xmm1
vmovss -1420(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1416(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1412(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1408(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1404(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1400(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1396(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -1808(%rbp)
vmovss -1392(%rbp), %xmm1
vmovss -1388(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1384(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1380(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1376(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1372(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1368(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1364(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -1804(%rbp)
vmovss -1360(%rbp), %xmm1
vmovss -1356(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1352(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1348(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1344(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1340(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1336(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm1
vmovss -1332(%rbp), %xmm0
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -1800(%rbp)
vmovss -1828(%rbp), %xmm0
vaddss -1824(%rbp), %xmm0, %xmm0
vaddss -1820(%rbp), %xmm0, %xmm0
vaddss -1816(%rbp), %xmm0, %xmm0
vaddss -1812(%rbp), %xmm0, %xmm0
vaddss -1808(%rbp), %xmm0, %xmm0
vaddss -1804(%rbp), %xmm0, %xmm0
vmovss -1800(%rbp), %xmm1
vaddss %xmm0, %xmm1, %xmm0
vmovss %xmm0, -1796(%rbp)
movl $0, %eax
call get_time
vmovq %xmm0, %rax
movq %rax, -1720(%rbp)
vmovsd -1720(%rbp), %xmm0
vsubsd -1728(%rbp), %xmm0, %xmm0
movl -1796(%rbp), %eax
vmovapd %xmm0, %xmm1
vmovd %eax, %xmm0
call print_result
nop
movq -24(%rbp), %rax
xorq %fs:40, %rax
je .L48
call __stack_chk_fail@PLT
.L48:
addq $1864, %rsp
popq %r10
.cfi_def_cfa 10, 0
popq %rbp
leaq -8(%r10), %rsp
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE4043:
.size unroll_vector, .-unroll_vector
.section .rodata
.LC9:
.string "========= Naive ========="
.LC10:
.string "========= Unroll ========="
.align 8
.LC11:
.string "========= Vector Instruction(FMA) ========="
.align 8
.LC12:
.string "========= Unroll + Vector Instruction(FMA) ========="
.text
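# main: seeds rand() with the current tv_sec, allocates two 32-byte
# aligned 524288-byte buffers (131072 floats each), fills them with
# rand() / 2147483648.0f (.LC8 is the float 2^31), and runs the four
# variants on the same data.  A hedged sketch of the likely C:
#   struct timeval tv;
#   gettimeofday(&tv, NULL);
#   srand(tv.tv_sec);
#   float *a = aligned_alloc(32, 131072 * sizeof(float));
#   float *b = aligned_alloc(32, 131072 * sizeof(float));
#   for (unsigned int i = 0; i < 131072; i++) {
#     a[i] = rand() / 2147483648.0f;
#     b[i] = rand() / 2147483648.0f;
#   }
#   naive(a, b); unroll(a, b); vector(a, b); unroll_vector(a, b);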
.globl main
.type main, @function
main:
.LFB4044:
.cfi_startproc
endbr64
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
movq %rsp, %rbp
.cfi_def_cfa_register 6
subq $64, %rsp
movq %fs:40, %rax
movq %rax, -8(%rbp)
xorl %eax, %eax
leaq -32(%rbp), %rax
movl $0, %esi
movq %rax, %rdi
call gettimeofday@PLT
movq -32(%rbp), %rax
movl %eax, %edi
call srand@PLT
movl $524288, %esi
movl $32, %edi
call aligned_alloc@PLT
movq %rax, -48(%rbp)
movl $524288, %esi
movl $32, %edi
call aligned_alloc@PLT
movq %rax, -40(%rbp)
movl $0, -52(%rbp)
jmp .L50
.L51:
call rand@PLT
vcvtsi2ssl %eax, %xmm0, %xmm0
movl -52(%rbp), %eax
leaq 0(,%rax,4), %rdx
movq -48(%rbp), %rax
addq %rdx, %rax
vmovss .LC8(%rip), %xmm1
vdivss %xmm1, %xmm0, %xmm0
vmovss %xmm0, (%rax)
call rand@PLT
vcvtsi2ssl %eax, %xmm0, %xmm0
movl -52(%rbp), %eax
leaq 0(,%rax,4), %rdx
movq -40(%rbp), %rax
addq %rdx, %rax
vmovss .LC8(%rip), %xmm1
vdivss %xmm1, %xmm0, %xmm0
vmovss %xmm0, (%rax)
addl $1, -52(%rbp)
.L50:
cmpl $131071, -52(%rbp)
jbe .L51
leaq .LC9(%rip), %rdi
call puts@PLT
movq -40(%rbp), %rdx
movq -48(%rbp), %rax
movq %rdx, %rsi
movq %rax, %rdi
call naive
leaq .LC10(%rip), %rdi
call puts@PLT
movq -40(%rbp), %rdx
movq -48(%rbp), %rax
movq %rdx, %rsi
movq %rax, %rdi
call unroll
leaq .LC11(%rip), %rdi
call puts@PLT
movq -40(%rbp), %rdx
movq -48(%rbp), %rax
movq %rdx, %rsi
movq %rax, %rdi
call vector
leaq .LC12(%rip), %rdi
call puts@PLT
movq -40(%rbp), %rdx
movq -48(%rbp), %rax
movq %rdx, %rsi
movq %rax, %rdi
call unroll_vector
movl $0, %eax
movq -8(%rbp), %rcx
xorq %fs:40, %rcx
je .L53
call __stack_chk_fail@PLT
.L53:
leave
.cfi_def_cfa 7, 8
ret
.cfi_endproc
.LFE4044:
.size main, .-main
.section .rodata
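# Floating-point constants, decoded from their IEEE-754 bit patterns:
#   .LC0 = 1e-6 (double)   .LC4 = 262144.0 (double, = 2 * 131072)
#   .LC5 = 1e9  (double)   .LC8 = 2147483648.0f (float, = 2^31)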
.align 8
.LC0:
.long 2696277389
.long 1051772663
.align 8
.LC4:
.long 0
.long 1091567616
.align 8
.LC5:
.long 0
.long 1104006501
.align 4
.LC8:
.long 1325400064
.ident "GCC: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0"
.section .note.GNU-stack,"",@progbits
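# GNU property note: presumably the x86 ISA feature marker
# (GNU_PROPERTY_X86_FEATURE_1_AND, bits 0x3 = IBT | SHSTK), matching
# the endbr64 instructions at each function entry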
.section .note.gnu.property,"a"
.align 8
.long 1f - 0f
.long 4f - 1f
.long 5
0:
.string "GNU"
1:
.align 8
.long 0xc0000002
.long 3f - 2f
2:
.long 0x3
3:
.align 8
4: