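# Source listing: 1187 lines, 25 KiB. Labelled ArmAsm by the viewer, but the content is x86-64 assembly in GNU as (AT&T) syntax generated by GCC from fma.c.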
|
.file "fma.c"
|
||
|
.text
|
||
|
# double get_time(void)
# ABI:  System V AMD64.
# Out:  xmm0 (low 64 bits) = tv_sec + tv_usec * .LC0, where .LC0 holds 1.0e-6
#       and tv is the struct timeval filled in by gettimeofday(&tv, NULL).
# Frame slots are rbp-relative and named below.
	.set	GT_TV_SEC, -32			# tv.tv_sec  (first 8 bytes of tv)
	.set	GT_TV_USEC, -24			# tv.tv_usec (second 8 bytes of tv)
	.set	GT_CANARY, -8			# saved copy of the stack guard at %fs:40
	.globl	get_time
	.type	get_time, @function
get_time:
.LFB4038:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	subq	$32, %rsp

	movq	%fs:40, %rax
	movq	%rax, GT_CANARY(%rbp)
	xorl	%eax, %eax			# do not carry the guard value into the callee

	leaq	GT_TV_SEC(%rbp), %rdi		# arg0 = &tv
	xorl	%esi, %esi			# arg1 = NULL
	call	gettimeofday@PLT

	vcvtsi2sdq	GT_TV_SEC(%rbp), %xmm1, %xmm1	# xmm1 = (double) tv.tv_sec
	vcvtsi2sdq	GT_TV_USEC(%rbp), %xmm2, %xmm2	# xmm2 = (double) tv.tv_usec
	vmulsd	.LC0(%rip), %xmm2, %xmm0	# xmm0 = tv_usec * 1.0e-6
	vaddsd	%xmm0, %xmm1, %xmm0		# xmm0 = tv_sec + tv_usec * 1.0e-6

	movq	GT_CANARY(%rbp), %rax		# verify the guard before returning
	xorq	%fs:40, %rax
	je	.Lget_time_guard_ok
	call	__stack_chk_fail@PLT
.Lget_time_guard_ok:
	leave
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE4038:
	.size	get_time, .-get_time
|
||
|
# NUL-terminated printf format strings referenced by print_result.
	.section	.rodata
.LC1:
	.asciz	"Result : %f\n"
.LC2:
	.asciz	"N : %d\n"
.LC3:
	.asciz	"Elapsed time : %f sec\n"
.LC6:
	.asciz	"Throughput : %.5f GFLOPS\n\n"
	.text
|
||
|
# void print_result(float result, double elapsed)
# ABI:  System V AMD64.
# In:   xmm0 = result (single precision), xmm1 = elapsed time (double).
# Prints four lines with printf: the result, the constant 131072, the elapsed
# time, and the throughput computed as .LC4 / elapsed / .LC5
# (262144.0 / elapsed / 1.0e9 with the constants defined in this file).
# Each printf that passes a double sets eax to 1, the number of vector
# registers used by the variadic call.
	.set	PR_RESULT, -4			# float  argument spilled from xmm0
	.set	PR_ELAPSED, -16			# double argument spilled from xmm1
	.globl	print_result
	.type	print_result, @function
print_result:
.LFB4039:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	subq	$16, %rsp
	vmovss	%xmm0, PR_RESULT(%rbp)
	vmovsd	%xmm1, PR_ELAPSED(%rbp)

	# Line 1: the result, widened to double for the %f conversion.
	leaq	.LC1(%rip), %rdi
	vcvtss2sd	PR_RESULT(%rbp), %xmm0, %xmm0
	movl	$1, %eax
	call	printf@PLT

	# Line 2: the element count, an integer argument only.
	leaq	.LC2(%rip), %rdi
	movl	$131072, %esi
	xorl	%eax, %eax			# no vector registers used
	call	printf@PLT

	# Line 3: the elapsed time.
	leaq	.LC3(%rip), %rdi
	vmovsd	PR_ELAPSED(%rbp), %xmm0
	movl	$1, %eax
	call	printf@PLT

	# Line 4: throughput = .LC4 / elapsed / .LC5.
	vmovsd	.LC4(%rip), %xmm0
	vdivsd	PR_ELAPSED(%rbp), %xmm0, %xmm0
	vdivsd	.LC5(%rip), %xmm0, %xmm0
	leaq	.LC6(%rip), %rdi
	movl	$1, %eax
	call	printf@PLT

	leave
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE4039:
	.size	print_result, .-print_result
|
||
|
# void naive(float *, float *)
# ABI:  System V AMD64.
# In:   rdi = first float array, rsi = second float array.
# Sums first[i] * second[i] for i = 0 .. 131071 in single precision, using a
# separate multiply and add per element and a running total kept in memory.
# The loop is bracketed by two get_time() calls and the total plus the time
# difference are handed to print_result().
# The instructions inside the loop are the same ones the compiler emitted,
# only spelled with named frame slots and a small load macro.
	.set	NAIVE_FIRST, -40		# saved rdi
	.set	NAIVE_SECOND, -48		# saved rsi
	.set	NAIVE_SUM, -24			# running float total
	.set	NAIVE_I, -20			# unsigned 32-bit loop counter
	.set	NAIVE_T0, -16			# time before the loop
	.set	NAIVE_T1, -8			# time after the loop

	# Load element NAIVE_I of the array whose pointer is in frame slot ptr.
	# Clobbers rax and rdx.
	.macro	NAIVE_LOAD ptr, dst
	movl	NAIVE_I(%rbp), %eax
	leaq	0(,%rax,4), %rdx
	movq	\ptr(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), \dst
	.endm

	.globl	naive
	.type	naive, @function
naive:
.LFB4040:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	subq	$48, %rsp
	movq	%rdi, NAIVE_FIRST(%rbp)
	movq	%rsi, NAIVE_SECOND(%rbp)
	movl	$0, NAIVE_SUM(%rbp)		# total = 0.0f (all-zero bit pattern)

	xorl	%eax, %eax
	call	get_time
	vmovsd	%xmm0, NAIVE_T0(%rbp)

	movl	$0, NAIVE_I(%rbp)
	jmp	.Lnaive_check
.Lnaive_body:
	NAIVE_LOAD NAIVE_FIRST, %xmm1
	NAIVE_LOAD NAIVE_SECOND, %xmm0
	vmulss	%xmm0, %xmm1, %xmm0		# product of the two elements
	vmovss	NAIVE_SUM(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0		# total + product
	vmovss	%xmm0, NAIVE_SUM(%rbp)
	addl	$1, NAIVE_I(%rbp)
.Lnaive_check:
	cmpl	$131071, NAIVE_I(%rbp)		# unsigned compare, loop while i <= 131071
	jbe	.Lnaive_body

	xorl	%eax, %eax
	call	get_time
	vmovsd	%xmm0, NAIVE_T1(%rbp)

	vmovsd	NAIVE_T1(%rbp), %xmm1
	vsubsd	NAIVE_T0(%rbp), %xmm1, %xmm1	# arg1 = t1 - t0
	vmovss	NAIVE_SUM(%rbp), %xmm0		# arg0 = total
	call	print_result

	leave
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE4040:
	.size	naive, .-naive
|
||
|
# void unroll(float *, float *)
# ABI:  System V AMD64.
# In:   rdi = first float array, rsi = second float array.
# Eight-way unrolled multiply-accumulate: iteration i (0 .. 16383) handles
# elements 8*i + k for k = 0 .. 7, each k feeding its own partial sum. The
# partial sums are added together after the loop, and the total plus the
# elapsed time go to print_result().
#
# Frame (rbp-relative):
#   -72 first array pointer      -80 second array pointer
#   -56, -52, ... , -28          partial sums for k = 0 .. 7
#   -24 loop counter             -20 final total
#   -16 time before the loop     -8  time after the loop
#
# The macros below expand to the per-element instruction sequence of the
# compiler output, so the timed loop executes the same instructions.

	# Load element 8*i + k of the array whose pointer is in frame slot ptr.
	# Clobbers rax and rdx.
	.macro	UNROLL_ELEM k, ptr, dst
	movl	-24(%rbp), %eax
	sall	$3, %eax
	.if	\k
	addl	$\k, %eax
	.endif
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	\ptr(%rbp), %rax
	addq	%rdx, %rax
	vmovss	(%rax), \dst
	.endm

	# partial[acc] = partial[acc] + first[8*i + k] * second[8*i + k]
	.macro	UNROLL_TERM k, acc
	UNROLL_ELEM \k, -72, %xmm1
	UNROLL_ELEM \k, -80, %xmm0
	vmulss	%xmm0, %xmm1, %xmm0
	vmovss	\acc(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, \acc(%rbp)
	.endm

	.globl	unroll
	.type	unroll, @function
unroll:
.LFB4041:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	subq	$80, %rsp
	movq	%rdi, -72(%rbp)
	movq	%rsi, -80(%rbp)

	# Zero the slot at -28, then copy it down the chain -32, -36, ... , -56.
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -28(%rbp)
	.irp	off,28,32,36,40,44,48,52
	vmovss	-\off(%rbp), %xmm0
	vmovss	%xmm0, -\off-4(%rbp)
	.endr

	xorl	%eax, %eax
	call	get_time
	vmovsd	%xmm0, -16(%rbp)

	movl	$0, -24(%rbp)
	jmp	.Lunroll_check
.Lunroll_body:
	UNROLL_TERM 0, -56
	UNROLL_TERM 1, -52
	UNROLL_TERM 2, -48
	UNROLL_TERM 3, -44
	UNROLL_TERM 4, -40
	UNROLL_TERM 5, -36
	UNROLL_TERM 6, -32
	UNROLL_TERM 7, -28
	addl	$1, -24(%rbp)
.Lunroll_check:
	cmpl	$16383, -24(%rbp)		# unsigned compare, loop while i <= 16383
	jbe	.Lunroll_body

	# total = ((((((p0 + p1) + p2) + p3) + p4) + p5) + p6), then p7 + that.
	vmovss	-56(%rbp), %xmm0
	.irp	off,52,48,44,40,36,32
	vaddss	-\off(%rbp), %xmm0, %xmm0
	.endr
	vmovss	-28(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -20(%rbp)

	xorl	%eax, %eax
	call	get_time
	vmovsd	%xmm0, -8(%rbp)

	vmovsd	-8(%rbp), %xmm1
	vsubsd	-16(%rbp), %xmm1, %xmm1		# arg1 = t1 - t0
	vmovss	-20(%rbp), %xmm0		# arg0 = total
	call	print_result

	leave
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE4041:
	.size	unroll, .-unroll
|
||
|
# void vector(float *, float *)
# ABI:  System V AMD64.
# In:   rdi = first float array, rsi = second float array. Both are read with
#       vmovaps at 32-byte granularity, so they must be 32-byte aligned.
# One 8-lane fused multiply-add per iteration (i = 0 .. 16383) into a single
# ymm accumulator kept at -240(%rbp). The eight lanes are then added in lane
# order and passed with the elapsed time to print_result().
#
# Fix relative to the compiler output: a vzeroupper is issued before each
# call to get_time. The function writes 256-bit registers (vinsertf128, the
# ymm loads and vfmadd231ps) and then reaches library code through get_time.
# Without vzeroupper the upper ymm halves are still marked in use there,
# which can cost AVX/SSE transition penalties if the callee runs legacy SSE
# instructions, and the second call is inside the timed region. vzeroupper only
# clears bits 255:128 of the vector registers, and nothing is live in them
# at either call site.
#
# Frame (rbp-relative, rsp realigned to 32 bytes by the prologue):
#   -328 first array pointer     -336 second array pointer
#   -312 loop counter            -308 final total
#   -304 .. -276                 eight scalar zeros used to build the vector
#   -272 time before the loop    -264 time after the loop
#   -240 accumulator (32 bytes)  -208 / -176 loaded vectors
#   -144 / -112 / -80            operand copies for the fused multiply-add
#   -24  stack guard copy
	.globl	vector
	.type	vector, @function
vector:
.LFB4042:
	.cfi_startproc
	endbr64
	leaq	8(%rsp), %r10
	.cfi_def_cfa 10, 0
	andq	$-32, %rsp
	pushq	-8(%r10)
	pushq	%rbp
	movq	%rsp, %rbp
	.cfi_escape 0x10,0x6,0x2,0x76,0
	pushq	%r10
	.cfi_escape 0xf,0x3,0x76,0x78,0x6
	subq	$328, %rsp
	movq	%rdi, -328(%rbp)
	movq	%rsi, -336(%rbp)
	movq	%fs:40, %rax
	movq	%rax, -24(%rbp)
	xorl	%eax, %eax

	# Build an all-zero 8-lane vector from eight scalar zeros.
	.irp	off,304,300,296,292,288,284,280,276
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, -\off(%rbp)
	.endr
	vmovss	-304(%rbp), %xmm1
	vmovss	-300(%rbp), %xmm0
	vunpcklps	%xmm1, %xmm0, %xmm2
	vmovss	-296(%rbp), %xmm1
	vmovss	-292(%rbp), %xmm0
	vunpcklps	%xmm1, %xmm0, %xmm1
	vmovss	-288(%rbp), %xmm3
	vmovss	-284(%rbp), %xmm0
	vunpcklps	%xmm3, %xmm0, %xmm3
	vmovss	-280(%rbp), %xmm4
	vmovss	-276(%rbp), %xmm0
	vunpcklps	%xmm4, %xmm0, %xmm0
	vmovlhps	%xmm3, %xmm0, %xmm0
	vmovlhps	%xmm2, %xmm1, %xmm1
	vinsertf128	$0x1, %xmm1, %ymm0, %ymm0
	vmovaps	%ymm0, -240(%rbp)

	vzeroupper				# clean upper ymm state before calling out
	movl	$0, %eax
	call	get_time
	vmovq	%xmm0, %rax
	movq	%rax, -272(%rbp)

	movl	$0, -312(%rbp)
	jmp	.L13
.L17:
	# Load 8 floats starting at element 8*i of the first array.
	movl	-312(%rbp), %eax
	sall	$3, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-328(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -248(%rbp)
	movq	-248(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -208(%rbp)
	# Load the matching 8 floats of the second array.
	movl	-312(%rbp), %eax
	sall	$3, %eax
	movl	%eax, %eax
	leaq	0(,%rax,4), %rdx
	movq	-336(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, -256(%rbp)
	movq	-256(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, -176(%rbp)
	# accumulator = second * first + accumulator, lane by lane.
	vmovaps	-240(%rbp), %ymm0
	vmovaps	-208(%rbp), %ymm1
	vmovaps	%ymm1, -144(%rbp)
	vmovaps	-176(%rbp), %ymm1
	vmovaps	%ymm1, -112(%rbp)
	vmovaps	%ymm0, -80(%rbp)
	vmovaps	-112(%rbp), %ymm1
	vmovaps	-80(%rbp), %ymm0
	vfmadd231ps	-144(%rbp), %ymm1, %ymm0
	nop
	vmovaps	%ymm0, -240(%rbp)
	addl	$1, -312(%rbp)
.L13:
	cmpl	$16383, -312(%rbp)		# unsigned compare, loop while i <= 16383
	jbe	.L17

	# Add the eight accumulator lanes from lowest address to highest.
	vmovss	-240(%rbp), %xmm1
	.irp	off,236,232,228,224,220,216
	vmovss	-\off(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	.endr
	vmovss	-212(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -308(%rbp)

	vzeroupper				# the loop used ymm registers
	movl	$0, %eax
	call	get_time
	vmovq	%xmm0, %rax
	movq	%rax, -264(%rbp)

	vmovsd	-264(%rbp), %xmm0
	vsubsd	-272(%rbp), %xmm0, %xmm0
	movl	-308(%rbp), %eax
	vmovapd	%xmm0, %xmm1			# arg1 = t1 - t0
	vmovd	%eax, %xmm0			# arg0 = total
	call	print_result
	nop

	movq	-24(%rbp), %rax			# verify the stack guard
	xorq	%fs:40, %rax
	je	.L18
	call	__stack_chk_fail@PLT
.L18:
	addq	$328, %rsp
	popq	%r10
	.cfi_def_cfa 10, 0
	popq	%rbp
	leaq	-8(%r10), %rsp
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE4042:
	.size	vector, .-vector
|
||
|
# void unroll_vector(float *, float *)
# ABI:  System V AMD64.
# In:   rdi = first float array, rsi = second float array. Both are read with
#       vmovaps at 32-byte granularity, so they must be 32-byte aligned.
# Eight independent 8-lane fused multiply-add chains per iteration: iteration
# i (0 .. 2047) covers elements 64*i .. 64*i + 63, chain c taking the eight
# elements starting at 64*i + 8*c. Afterwards each accumulator is reduced to
# a scalar in lane order, the eight scalars are added, and the total plus the
# elapsed time go to print_result().
#
# Changes relative to the compiler output:
#  * vzeroupper is issued before each call to get_time. The function writes
#    256-bit registers and then reaches library code through get_time.
#    Without vzeroupper the upper ymm halves are still marked in use there,
#    which can cost AVX/SSE transition penalties if the callee runs legacy
#    SSE instructions, and the second call is inside the timed region.
#    vzeroupper only clears bits 255:128 of the vector registers, and
#    nothing is live in them at either call site.
#  * The eightfold repeated load / multiply-add / reduction sequences are
#    written once as macros. Each macro expands to the instruction sequence
#    of the original, so the timed loop executes the same instructions.
#
# Frame (rbp-relative, rsp realigned to 32 bytes by the prologue):
#   -1848 first array pointer    -1856 second array pointer
#   -1832 loop counter
#   -1828 .. -1800               per-accumulator scalar sums (chains 0 .. 7)
#   -1796 final total
#   -1792 .. -1732               scalar zeros used to build the zero vectors
#   -1728 time before the loop   -1720 time after the loop
#   -1712 .. -1592               scratch copies of the 16 load addresses
#   -1584, -1552, ... , -1360    accumulators for chains 0 .. 7 (32 bytes each)
#   -1328 .. -848                the 16 loaded vectors
#   -816 .. -80                  operand copies for the fused multiply-adds
#   -24   stack guard copy

	# Build an all-zero 8-lane vector from eight scalar zeros stored at
	# scalars .. scalars+28 and store it at dst.
	.macro	UV_ZERO_YMM scalars, dst
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, \scalars(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, \scalars+4(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, \scalars+8(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, \scalars+12(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, \scalars+16(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, \scalars+20(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, \scalars+24(%rbp)
	vxorps	%xmm0, %xmm0, %xmm0
	vmovss	%xmm0, \scalars+28(%rbp)
	vmovss	\scalars(%rbp), %xmm1
	vmovss	\scalars+4(%rbp), %xmm0
	vunpcklps	%xmm1, %xmm0, %xmm2
	vmovss	\scalars+8(%rbp), %xmm1
	vmovss	\scalars+12(%rbp), %xmm0
	vunpcklps	%xmm1, %xmm0, %xmm1
	vmovss	\scalars+16(%rbp), %xmm3
	vmovss	\scalars+20(%rbp), %xmm0
	vunpcklps	%xmm3, %xmm0, %xmm3
	vmovss	\scalars+24(%rbp), %xmm4
	vmovss	\scalars+28(%rbp), %xmm0
	vunpcklps	%xmm4, %xmm0, %xmm0
	vmovlhps	%xmm3, %xmm0, %xmm0
	vmovlhps	%xmm2, %xmm1, %xmm1
	vinsertf128	$0x1, %xmm1, %ymm0, %ymm0
	vmovaps	%ymm0, \dst(%rbp)
	.endm

	# Copy one 32-byte frame slot to another through ymm0.
	.macro	UV_COPY_YMM src, dst
	vmovaps	\src(%rbp), %ymm0
	vmovaps	%ymm0, \dst(%rbp)
	.endm

	# Load the 8 floats starting at element 64*i + elem of the array whose
	# pointer is in frame slot ptr, keep the address in slot and the vector
	# in dst. Clobbers rax, rdx and ymm0.
	.macro	UV_LOAD elem, ptr, slot, dst
	movl	-1832(%rbp), %eax
	sall	$6, %eax
	movl	%eax, %eax
	.if	\elem
	addq	$\elem, %rax
	.endif
	leaq	0(,%rax,4), %rdx
	movq	\ptr(%rbp), %rax
	addq	%rdx, %rax
	movq	%rax, \slot(%rbp)
	movq	\slot(%rbp), %rax
	vmovaps	(%rax), %ymm0
	vmovaps	%ymm0, \dst(%rbp)
	.endm

	# acc = vb * va + acc, lane by lane. tmp, tmp+32 and tmp+64 receive
	# copies of va, vb and acc before the fused multiply-add.
	.macro	UV_FMA acc, va, vb, tmp
	vmovaps	\acc(%rbp), %ymm0
	vmovaps	\va(%rbp), %ymm1
	vmovaps	%ymm1, \tmp(%rbp)
	vmovaps	\vb(%rbp), %ymm1
	vmovaps	%ymm1, \tmp+32(%rbp)
	vmovaps	%ymm0, \tmp+64(%rbp)
	vmovaps	\tmp+32(%rbp), %ymm1
	vmovaps	\tmp+64(%rbp), %ymm0
	vfmadd231ps	\tmp(%rbp), %ymm1, %ymm0
	nop
	vmovaps	%ymm0, \acc(%rbp)
	.endm

	# out = sum of the eight lanes of acc, added from lowest address up.
	.macro	UV_HSUM acc, out
	vmovss	\acc(%rbp), %xmm1
	vmovss	\acc+4(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	\acc+8(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	\acc+12(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	\acc+16(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	\acc+20(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	\acc+24(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm1
	vmovss	\acc+28(%rbp), %xmm0
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, \out(%rbp)
	.endm

	.globl	unroll_vector
	.type	unroll_vector, @function
unroll_vector:
.LFB4043:
	.cfi_startproc
	endbr64
	leaq	8(%rsp), %r10
	.cfi_def_cfa 10, 0
	andq	$-32, %rsp
	pushq	-8(%r10)
	pushq	%rbp
	movq	%rsp, %rbp
	.cfi_escape 0x10,0x6,0x2,0x76,0
	pushq	%r10
	.cfi_escape 0xf,0x3,0x76,0x78,0x6
	subq	$1864, %rsp
	movq	%rdi, -1848(%rbp)
	movq	%rsi, -1856(%rbp)
	movq	%fs:40, %rax
	movq	%rax, -24(%rbp)
	xorl	%eax, %eax

	# Accumulators 3, 2, 1, 0: build one zero vector, then copy it down.
	UV_ZERO_YMM -1760, -1488
	UV_COPY_YMM -1488, -1520
	UV_COPY_YMM -1520, -1552
	UV_COPY_YMM -1552, -1584
	# Accumulators 7, 6, 5, 4: same again.
	UV_ZERO_YMM -1792, -1360
	UV_COPY_YMM -1360, -1392
	UV_COPY_YMM -1392, -1424
	UV_COPY_YMM -1424, -1456

	vzeroupper				# clean upper ymm state before calling out
	movl	$0, %eax
	call	get_time
	vmovq	%xmm0, %rax
	movq	%rax, -1728(%rbp)

	movl	$0, -1832(%rbp)
	jmp	.L22
.L47:
	# Sixteen loads: for each chain, first array then second array.
	UV_LOAD 0, -1848, -1592, -1328
	UV_LOAD 0, -1856, -1600, -1296
	UV_LOAD 8, -1848, -1608, -1264
	UV_LOAD 8, -1856, -1616, -1232
	UV_LOAD 16, -1848, -1624, -1200
	UV_LOAD 16, -1856, -1632, -1168
	UV_LOAD 24, -1848, -1640, -1136
	UV_LOAD 24, -1856, -1648, -1104
	UV_LOAD 32, -1848, -1656, -1072
	UV_LOAD 32, -1856, -1664, -1040
	UV_LOAD 40, -1848, -1672, -1008
	UV_LOAD 40, -1856, -1680, -976
	UV_LOAD 48, -1848, -1688, -944
	UV_LOAD 48, -1856, -1696, -912
	UV_LOAD 56, -1848, -1704, -880
	UV_LOAD 56, -1856, -1712, -848
	# Eight fused multiply-adds, one per accumulator.
	UV_FMA -1584, -1328, -1296, -144
	UV_FMA -1552, -1264, -1232, -240
	UV_FMA -1520, -1200, -1168, -336
	UV_FMA -1488, -1136, -1104, -432
	UV_FMA -1456, -1072, -1040, -528
	UV_FMA -1424, -1008, -976, -624
	UV_FMA -1392, -944, -912, -720
	UV_FMA -1360, -880, -848, -816
	addl	$1, -1832(%rbp)
.L22:
	cmpl	$2047, -1832(%rbp)		# unsigned compare, loop while i <= 2047
	jbe	.L47

	# Reduce each accumulator to one scalar.
	UV_HSUM -1584, -1828
	UV_HSUM -1552, -1824
	UV_HSUM -1520, -1820
	UV_HSUM -1488, -1816
	UV_HSUM -1456, -1812
	UV_HSUM -1424, -1808
	UV_HSUM -1392, -1804
	UV_HSUM -1360, -1800

	# total = ((((((s0 + s1) + s2) + s3) + s4) + s5) + s6), then s7 + that.
	vmovss	-1828(%rbp), %xmm0
	vaddss	-1824(%rbp), %xmm0, %xmm0
	vaddss	-1820(%rbp), %xmm0, %xmm0
	vaddss	-1816(%rbp), %xmm0, %xmm0
	vaddss	-1812(%rbp), %xmm0, %xmm0
	vaddss	-1808(%rbp), %xmm0, %xmm0
	vaddss	-1804(%rbp), %xmm0, %xmm0
	vmovss	-1800(%rbp), %xmm1
	vaddss	%xmm0, %xmm1, %xmm0
	vmovss	%xmm0, -1796(%rbp)

	vzeroupper				# the loop used ymm registers
	movl	$0, %eax
	call	get_time
	vmovq	%xmm0, %rax
	movq	%rax, -1720(%rbp)

	vmovsd	-1720(%rbp), %xmm0
	vsubsd	-1728(%rbp), %xmm0, %xmm0
	movl	-1796(%rbp), %eax
	vmovapd	%xmm0, %xmm1			# arg1 = t1 - t0
	vmovd	%eax, %xmm0			# arg0 = total
	call	print_result
	nop

	movq	-24(%rbp), %rax			# verify the stack guard
	xorq	%fs:40, %rax
	je	.L48
	call	__stack_chk_fail@PLT
.L48:
	addq	$1864, %rsp
	popq	%r10
	.cfi_def_cfa 10, 0
	popq	%rbp
	leaq	-8(%r10), %rsp
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE4043:
	.size	unroll_vector, .-unroll_vector
|
||
|
# NUL-terminated section banners that main prints with puts.
	.section	.rodata
.LC9:
	.asciz	"========= Naive ========="
.LC10:
	.asciz	"========= Unroll ========="
	.balign	8
.LC11:
	.asciz	"========= Vector Instruction(FMA) ========="
	.balign	8
.LC12:
	.asciz	"========= Unroll + Vector Instruction(FMA) ========="
	.text
|
||
|
# int main(void)
# ABI:  System V AMD64.
# Out:  eax = 0 on success, 1 if either buffer could not be allocated.
# Seeds rand() from the seconds field returned by gettimeofday, allocates two
# 32-byte aligned buffers of 524288 bytes (131072 floats), fills both with
# rand() / .LC8 (.LC8 is 2147483648.0f), then runs naive, unroll, vector and
# unroll_vector on them, each preceded by a banner printed with puts.
#
# Fixes relative to the compiler output:
#  * Each aligned_alloc result is checked. The original stored through the
#    returned pointer unconditionally, which faults if the allocation fails.
#  * Both buffers are released with free on every path. The pointer slots are
#    cleared first so the shared cleanup can free them unconditionally
#    (free of a null pointer does nothing).
#
# Frame (rbp-relative):
#   -56 exit status      -52 loop counter
#   -48 first buffer     -40 second buffer
#   -32 struct timeval   -8  stack guard copy
	.globl	main
	.type	main, @function
main:
.LFB4044:
	.cfi_startproc
	endbr64
	pushq	%rbp
	.cfi_def_cfa_offset 16
	.cfi_offset 6, -16
	movq	%rsp, %rbp
	.cfi_def_cfa_register 6
	subq	$64, %rsp
	movq	%fs:40, %rax
	movq	%rax, -8(%rbp)
	xorl	%eax, %eax

	movq	$0, -48(%rbp)			# first buffer  = NULL
	movq	$0, -40(%rbp)			# second buffer = NULL
	movl	$1, -56(%rbp)			# status stays 1 until everything ran

	# srand((unsigned) tv.tv_sec)
	leaq	-32(%rbp), %rax
	movl	$0, %esi
	movq	%rax, %rdi
	call	gettimeofday@PLT
	movq	-32(%rbp), %rax
	movl	%eax, %edi
	call	srand@PLT

	# first buffer = aligned_alloc(32, 524288)
	movl	$524288, %esi
	movl	$32, %edi
	call	aligned_alloc@PLT
	movq	%rax, -48(%rbp)
	testq	%rax, %rax
	je	.Lmain_cleanup

	# second buffer = aligned_alloc(32, 524288)
	movl	$524288, %esi
	movl	$32, %edi
	call	aligned_alloc@PLT
	movq	%rax, -40(%rbp)
	testq	%rax, %rax
	je	.Lmain_cleanup

	movl	$0, -52(%rbp)
	jmp	.L50
.L51:
	# first[i] = (float) rand() / 2147483648.0f
	call	rand@PLT
	vcvtsi2ssl	%eax, %xmm0, %xmm0
	movl	-52(%rbp), %eax
	leaq	0(,%rax,4), %rdx
	movq	-48(%rbp), %rax
	addq	%rdx, %rax
	vmovss	.LC8(%rip), %xmm1
	vdivss	%xmm1, %xmm0, %xmm0
	vmovss	%xmm0, (%rax)
	# second[i] = (float) rand() / 2147483648.0f
	call	rand@PLT
	vcvtsi2ssl	%eax, %xmm0, %xmm0
	movl	-52(%rbp), %eax
	leaq	0(,%rax,4), %rdx
	movq	-40(%rbp), %rax
	addq	%rdx, %rax
	vmovss	.LC8(%rip), %xmm1
	vdivss	%xmm1, %xmm0, %xmm0
	vmovss	%xmm0, (%rax)
	addl	$1, -52(%rbp)
.L50:
	cmpl	$131071, -52(%rbp)		# unsigned compare, loop while i <= 131071
	jbe	.L51

	leaq	.LC9(%rip), %rdi
	call	puts@PLT
	movq	-40(%rbp), %rdx
	movq	-48(%rbp), %rax
	movq	%rdx, %rsi
	movq	%rax, %rdi
	call	naive

	leaq	.LC10(%rip), %rdi
	call	puts@PLT
	movq	-40(%rbp), %rdx
	movq	-48(%rbp), %rax
	movq	%rdx, %rsi
	movq	%rax, %rdi
	call	unroll

	leaq	.LC11(%rip), %rdi
	call	puts@PLT
	movq	-40(%rbp), %rdx
	movq	-48(%rbp), %rax
	movq	%rdx, %rsi
	movq	%rax, %rdi
	call	vector

	leaq	.LC12(%rip), %rdi
	call	puts@PLT
	movq	-40(%rbp), %rdx
	movq	-48(%rbp), %rax
	movq	%rdx, %rsi
	movq	%rax, %rdi
	call	unroll_vector

	movl	$0, -56(%rbp)			# all four kernels ran

.Lmain_cleanup:
	movq	-40(%rbp), %rdi
	call	free@PLT
	movq	-48(%rbp), %rdi
	call	free@PLT

	movl	-56(%rbp), %eax			# return value
	movq	-8(%rbp), %rcx			# verify the stack guard
	xorq	%fs:40, %rcx
	je	.L53
	call	__stack_chk_fail@PLT
.L53:
	leave
	.cfi_def_cfa 7, 8
	ret
	.cfi_endproc
.LFE4044:
	.size	main, .-main
|
||
|
# Floating-point constants, written as single little-endian words.
# The decoded values follow from the IEEE-754 bit patterns.
	.section	.rodata
	.balign	8
.LC0:
	.quad	0x3EB0C6F7A0B5ED8D		# double 1.0e-6, used by get_time
	.balign	8
.LC4:
	.quad	0x4110000000000000		# double 262144.0, used by print_result
	.balign	8
.LC5:
	.quad	0x41CDCD6500000000		# double 1.0e9, used by print_result
	.balign	4
.LC8:
	.long	0x4F000000			# float 2147483648.0, used by main
|
||
|
# Compiler identification, non-executable-stack marker and the GNU property
# note. The note is laid out as name size, descriptor size, type, name,
# then the descriptor holding one property record of type, size and value.
# NOTE(review): type 0x5 and property 0xc0000002 with value 0x3 look like the
# GNU property note advertising the CET features (IBT and SHSTK), confirm
# against the psABI before relying on this.
	.ident	"GCC: (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0"
	.section	.note.GNU-stack,"",@progbits
	.section	.note.gnu.property,"a"
	.balign	8
	.long	1f - 0f				# size of the name field
	.long	4f - 1f				# size of the descriptor
	.long	0x5				# note type
0:
	.asciz	"GNU"
1:
	.balign	8
	.long	0xc0000002			# property type
	.long	3f - 2f				# property data size
2:
	.long	0x3				# property data
3:
	.balign	8
4:
|