# Merge from ihse-cflags-rewrite-branch.
#
# Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#
# TODO-AARCH64
# NOTE WELL! The _Copy functions are called directly
# from server-compiler-generated code via CallLeafNoFP,
# which means that they *must* either not use floating
# point or use it in the same manner as does the server
# compiler.
# Export every _Copy_* entry point defined in this file as a global symbol
# of ELF type "function". The helper macro emits the same .globl/.type pair
# that would otherwise be written out by hand for each symbol; it is purged
# after use so the name does not leak into the rest of the file.
.macro export_copy_stub sym
.globl \sym
.type \sym, %function
.endm
# Byte copies
export_copy_stub _Copy_conjoint_bytes
export_copy_stub _Copy_arrayof_conjoint_bytes
# Word copies
export_copy_stub _Copy_disjoint_words
export_copy_stub _Copy_conjoint_words
# jshort copies
export_copy_stub _Copy_conjoint_jshorts_atomic
export_copy_stub _Copy_arrayof_conjoint_jshorts
# jint copies
export_copy_stub _Copy_conjoint_jints_atomic
export_copy_stub _Copy_arrayof_conjoint_jints
# jlong copies
export_copy_stub _Copy_conjoint_jlongs_atomic
export_copy_stub _Copy_arrayof_conjoint_jlongs
.purgem export_copy_stub
# All code below is emitted into the .text section.
.text
# SpinPause: executes a single `yield` hint instruction and returns.
# Takes no arguments and writes no registers; x0 is left exactly as the
# caller had it.
# NOTE(review): if the C-side prototype declares a return value, nothing is
# placed in x0/w0 here - confirm that callers ignore the result.
.globl SpinPause
.type SpinPause, %function
SpinPause:
yield
ret
# Support for void Copy::conjoint_bytes(void* from,
# void* to,
# size_t count)
#
# No copy is performed: the entry point consists solely of `hlt 1002`.
# Presumably a not-yet-implemented placeholder (see the TODO-AARCH64 marker
# near the top of this file); the immediate 1002 distinguishes this stub from
# the other hlt stubs in the file.
# There is no `ret` after the hlt, so if execution were ever resumed past it,
# control would fall through into the code that follows.
_Copy_conjoint_bytes:
hlt 1002
# Support for void Copy::arrayof_conjoint_bytes(void* from,
# void* to,
# size_t count)
#
# No copy is performed: the entry point consists solely of `hlt 1003`.
# Presumably a not-yet-implemented placeholder (see the TODO-AARCH64 marker
# near the top of this file); the immediate 1003 distinguishes this stub from
# the other hlt stubs in the file.
# There is no `ret` after the hlt, so if execution were ever resumed past it,
# control would fall through into the code that follows.
_Copy_arrayof_conjoint_bytes:
hlt 1003
# Support for void Copy::disjoint_words(void* from,
# void* to,
# size_t count)
#
# Forward (ascending-address) copy.
# In: x0 = from, x1 = to, x2 = length. x2 is added directly to the pointers,
# so it is used as a BYTE count here. Only bit 3 is tested for an odd
# trailing unit and the rest is handled in 16-byte pairs, so x2 is
# presumably always a multiple of 8 - confirm against the caller.
# Writes: x0-x18 and the condition flags. No stack is used.
# NOTE(review): x18 is used as a scratch register; this assumes a platform
# that does not reserve x18 - confirm.
#
# Structure:
# * x2 < 128: one optional 8-byte copy, then a computed branch into a
# table of ldp/stp pairs that copies the remaining multiple of 16.
# * x2 >= 128: software-pipelined loop moving 64 bytes per iteration, then
# the remainder (< 64 bytes) is handled by the small path above.
# This entry point is also branched to from _Copy_conjoint_words.
_Copy_disjoint_words:
# These and further memory prefetches may hit out of array ranges.
# Experiments showed that prefetching of inaccessible memory doesn't result in exceptions.
prfm pldl1keep, [x0, #0]
prfm pstl1keep, [x1, #0]
prfm pldl1keep, [x0, #64]
prfm pstl1keep, [x1, #64]
# x18 = x2 - 128; a non-negative result selects the large-copy path.
subs x18, x2, #128
b.ge dw_large
dw_lt_128:
# Copy [x0, x0 + x2) to [x1, x1 + x2), with x2 < 128.
# Also entered from dw_loop_end with the remainder of a large copy.
adr x15, dw_tail_table_base
and x16, x2, #~8
# Calculate address to jump and store it to x15:
# Each pair of instructions before dw_tail_table_base copies 16 bytes.
# x16 is count of bytes to copy aligned down by 16.
# So x16/16 pairs of instructions should be executed.
# Each pair takes 8 bytes, so x15 = dw_tail_table_base - (x16/16)*8 = x15 - x16/2
sub x15, x15, x16, lsr #1
# Instruction prefetch hint for the computed branch target.
prfm plil1keep, [x15]
# x17/x18 = one past the end of source/destination; the table below
# addresses its data relative to these end pointers.
add x17, x0, x2
add x18, x1, x2
# If x2 = x16 + 8, then copy 8 bytes and x16 bytes after that.
# Otherwise x2 = x16, so proceed to copy x16 bytes.
tbz x2, #3, dw_lt_128_even
ldr x3, [x0]
str x3, [x1]
dw_lt_128_even:
# Copy [x17 - x16, x17) to [x18 - x16, x18)
# x16 is aligned by 16 and less than 128
# Execute (x16/16) ldp-stp pairs; each pair copies 16 bytes
br x15
# Branch table: entered part-way through, always runs to dw_tail_table_base.
# The final pair reuses x15/x16 as data registers, which is safe because the
# branch target and the aligned length are no longer needed at that point.
ldp x3, x4, [x17, #-112]
stp x3, x4, [x18, #-112]
ldp x5, x6, [x17, #-96]
stp x5, x6, [x18, #-96]
ldp x7, x8, [x17, #-80]
stp x7, x8, [x18, #-80]
ldp x9, x10, [x17, #-64]
stp x9, x10, [x18, #-64]
ldp x11, x12, [x17, #-48]
stp x11, x12, [x18, #-48]
ldp x13, x14, [x17, #-32]
stp x13, x14, [x18, #-32]
ldp x15, x16, [x17, #-16]
stp x15, x16, [x18, #-16]
dw_tail_table_base:
ret
# Align to 64 bytes, then pad with 12 nops so that the 4 preload
# instructions at dw_large end exactly on the next 64-byte boundary.
.p2align 6
.rept 12
nop
.endr
dw_large:
# x18 >= 0;
# Copy [x0, x0 + x18 + 128) to [x1, x1 + x18 + 128)
# Preload the first 64 bytes into x3-x10 and advance x0 past them.
ldp x3, x4, [x0], #64
ldp x5, x6, [x0, #-48]
ldp x7, x8, [x0, #-32]
ldp x9, x10, [x0, #-16]
# Before and after each iteration of loop registers x3-x10 contain [x0 - 64, x0),
# and x1 is a place to copy this data;
# x18 contains number of bytes to be stored minus 128
# Exactly 16 instructions from p2align, so dw_loop starts from cache line boundary
# Checking it explicitly by aligning with "hlt 1000" instructions:
# 0xd4407d00 is the fill word; if the count above ever drifts, the padding
# is executed and halts instead of silently running misaligned.
.p2alignl 6, 0xd4407d00
dw_loop:
prfm pldl1keep, [x0, #64]
# The next line actually hurt memory copy performance (for interpreter) - JDK-8078120
# prfm pstl1keep, [x1, #64]
# Flags set here are consumed by the b.ge at the bottom of the loop; none of
# the stp/ldp/add instructions in between modify them.
subs x18, x18, #64
# Store the 64 bytes held in x3-x10 while reloading the next 64 bytes.
stp x3, x4, [x1, #0]
ldp x3, x4, [x0, #0]
stp x5, x6, [x1, #16]
ldp x5, x6, [x0, #16]
stp x7, x8, [x1, #32]
ldp x7, x8, [x0, #32]
stp x9, x10, [x1, #48]
ldp x9, x10, [x0, #48]
add x1, x1, #64
add x0, x0, #64
b.ge dw_loop
# 13 instructions from dw_loop, so the loop body fits into one cache line
dw_loop_end:
# x2 = bytes still to copy after the 64 bytes stored below; the flags from
# this adds are consumed by the b.ne further down (stp leaves them intact).
adds x2, x18, #64
stp x3, x4, [x1], #64
stp x5, x6, [x1, #-48]
stp x7, x8, [x1, #-32]
stp x9, x10, [x1, #-16]
# Increased x18 by 64, but stored 64 bytes, so x2 contains exact number of bytes to be stored
# If this number is not zero, also copy remaining bytes
b.ne dw_lt_128
ret
# Support for void Copy::conjoint_words(void* from,
# void* to,
# size_t count)
#
# Overlap-safe copy.
# In: x0 = from, x1 = to, x2 = length (added directly to the pointers, i.e.
# used as a byte count).
# Writes: x0, x1, x3, x4, x5 and the condition flags on the backward path;
# everything _Copy_disjoint_words writes on the forward path.
_Copy_conjoint_words:
# Direction test. After the subs, "hi" holds exactly when from < to.
# The ccmp then refines it: "hi" holds exactly when
# (from < to) and (to - from < count);
# in every other case the flags are forced to 0, which reads as "ls".
subs x3, x1, x0
ccmp x2, x3, #0, hi
# A forward copy is safe unless the destination starts inside the source
# range, so all "ls" cases (including overlap with to <= from) are handed to
# _Copy_disjoint_words.
b.ls _Copy_disjoint_words
# Backward copy. Overlap of this kind is expected to be rare, so this path
# is kept simple rather than optimized.
# Move both pointers to one past the end of their ranges.
add x1, x1, x2
add x0, x0, x2
# x3 = count aligned down by 2*wordSize. The Z flag produced here is what
# the b.eq below tests; the intervening sub does not touch the flags.
ands x3, x2, #~8
sub x3, x3, #16
# Aligned part is empty (0 or 1 words in total): no pair loop.
b.eq cw_backward_tail
# x3 >= 0
# Copy [x0 - x3 - 16, x0) to [x1 - x3 - 16, x1), walking downwards.
cw_backward_pairs:
ldp x4, x5, [x0, #-16]!
stp x4, x5, [x1, #-16]!
subs x3, x3, #16
b.ge cw_backward_pairs
cw_backward_tail:
# Bit 3 of the length says whether a single word is left at the bottom.
tbnz x2, #3, cw_backward_last_word
ret
cw_backward_last_word:
ldr x3, [x0, #-8]
str x3, [x1, #-8]
ret
# Support for void Copy::conjoint_jshorts_atomic(void* from,
# void* to,
# size_t count)
#
# Overlap-safe copy in which every element is moved with one halfword load
# (ldrh) and one halfword store (strh); no wider accesses are used.
# In: x0 = from, x1 = to, x2 = length. x2 is added directly to the pointers
# and consumed 14 per loop iteration, so it is used as a BYTE count;
# only even values are handled (presumably always 2 * element count -
# confirm against the caller).
# Writes: x0-x10, x17, x18 and the condition flags. No stack is used.
# NOTE(review): x18 is used as a scratch register; this assumes a platform
# that does not reserve x18 - confirm.
#
# Both directions share one shape: a main loop moving 14 bytes (7 halfwords)
# per iteration, then a tail for the remaining < 14 bytes made of an optional
# 8-byte group plus 0, 2, 4 or 6 further bytes.
_Copy_conjoint_jshorts_atomic:
# x17/x18 = one past the end of source/destination. They are computed once
# from the original length and used by both directions.
add x17, x0, x2
add x18, x1, x2
subs x3, x1, x0
# hi is met <=> (from < to) and (to - from < count)
ccmp x2, x3, #0, hi
b.hi cs_backward
# Forward copy. x3 = x2 - 14; a non-negative result selects the main loop.
subs x3, x2, #14
b.ge cs_forward_loop
# Copy x2 < 14 bytes from x0 to x1
cs_forward_lt14:
# x7 = x2 mod 8. The Z flag set here is consumed by the b.eq at
# cs_forward_lt8; tbz/ldrh/strh in between leave the flags untouched.
ands x7, x2, #7
tbz x2, #3, cs_forward_lt8
# Bit 3 set: copy the leading 8 bytes (all loads are issued before the stores).
ldrh w3, [x0, #0]
ldrh w4, [x0, #2]
ldrh w5, [x0, #4]
ldrh w6, [x0, #6]
strh w3, [x1, #0]
strh w4, [x1, #2]
strh w5, [x1, #4]
strh w6, [x1, #6]
# Copy x7 < 8 bytes from x17 - x7 to x18 - x7
cs_forward_lt8:
b.eq cs_forward_0
# Dispatch on x7 = 2, 4 or 6; the labels below fall through into each other
# so that exactly x7/2 halfwords are copied in ascending address order.
cmp x7, #4
b.lt cs_forward_2
b.eq cs_forward_4
cs_forward_6:
ldrh w3, [x17, #-6]
strh w3, [x18, #-6]
cs_forward_4:
ldrh w4, [x17, #-4]
strh w4, [x18, #-4]
cs_forward_2:
ldrh w5, [x17, #-2]
strh w5, [x18, #-2]
cs_forward_0:
ret
# Copy [x0, x0 + x3 + 14) to [x1, x1 + x3 + 14)
# x3 >= 0
.p2align 6
cs_forward_loop:
# Flags set here are consumed by the b.ge at the bottom of the loop.
subs x3, x3, #14
# Load 7 halfwords; the first load post-increments x0 by 14, so the remaining
# offsets are negative.
ldrh w4, [x0], #14
ldrh w5, [x0, #-12]
ldrh w6, [x0, #-10]
ldrh w7, [x0, #-8]
ldrh w8, [x0, #-6]
ldrh w9, [x0, #-4]
ldrh w10, [x0, #-2]
# Store them the same way, post-incrementing x1 by 14.
strh w4, [x1], #14
strh w5, [x1, #-12]
strh w6, [x1, #-10]
strh w7, [x1, #-8]
strh w8, [x1, #-6]
strh w9, [x1, #-4]
strh w10, [x1, #-2]
b.ge cs_forward_loop
# Exactly 16 instructions from cs_forward_loop, so loop fits into one cache line
adds x2, x3, #14
# x2 bytes should be copied from x0 to x1
b.ne cs_forward_lt14
ret
# Backward copy: very similar to forward copying, but mirrored. The main loop
# walks x17/x18 downwards from the end, the 8-byte group is taken from just
# below x17/x18, and the last 0-6 bytes are those at x0/x1 (which this
# direction never modifies).
cs_backward:
subs x3, x2, #14
b.ge cs_backward_loop
cs_backward_lt14:
# x7 = x2 mod 8; Z flag is consumed by the b.eq at cs_backward_lt8.
ands x7, x2, #7
tbz x2, #3, cs_backward_lt8
# Bit 3 set: copy the 8 bytes just below the current end pointers.
ldrh w3, [x17, #-8]
ldrh w4, [x17, #-6]
ldrh w5, [x17, #-4]
ldrh w6, [x17, #-2]
strh w3, [x18, #-8]
strh w4, [x18, #-6]
strh w5, [x18, #-4]
strh w6, [x18, #-2]
cs_backward_lt8:
b.eq cs_backward_0
# Dispatch on x7 = 2, 4 or 6; fall-through copies x7/2 halfwords in
# descending address order, ending with the one at offset 0.
cmp x7, #4
b.lt cs_backward_2
b.eq cs_backward_4
cs_backward_6:
ldrh w3, [x0, #4]
strh w3, [x1, #4]
cs_backward_4:
ldrh w4, [x0, #2]
strh w4, [x1, #2]
cs_backward_2:
ldrh w5, [x0, #0]
strh w5, [x1, #0]
cs_backward_0:
ret
.p2align 6
cs_backward_loop:
# Flags set here are consumed by the b.ge at the bottom of the loop.
subs x3, x3, #14
# Load 7 halfwords; the first load pre-decrements x17 by 14, so the remaining
# offsets are positive. All loads complete before the first store.
ldrh w4, [x17, #-14]!
ldrh w5, [x17, #2]
ldrh w6, [x17, #4]
ldrh w7, [x17, #6]
ldrh w8, [x17, #8]
ldrh w9, [x17, #10]
ldrh w10, [x17, #12]
strh w4, [x18, #-14]!
strh w5, [x18, #2]
strh w6, [x18, #4]
strh w7, [x18, #6]
strh w8, [x18, #8]
strh w9, [x18, #10]
strh w10, [x18, #12]
b.ge cs_backward_loop
# x2 = bytes still to copy; if non-zero, finish with the backward tail.
adds x2, x3, #14
b.ne cs_backward_lt14
ret
# Support for void Copy::arrayof_conjoint_jshorts(void* from,
# void* to,
# size_t count)
#
# No copy is performed: the entry point consists solely of `hlt 1007`.
# Presumably a not-yet-implemented placeholder (see the TODO-AARCH64 marker
# near the top of this file); the immediate 1007 distinguishes this stub from
# the other hlt stubs in the file.
# There is no `ret` after the hlt, so if execution were ever resumed past it,
# control would fall through into the code that follows.
_Copy_arrayof_conjoint_jshorts:
hlt 1007
# Support for void Copy::conjoint_jlongs_atomic(jlong* from,
# jlong* to,
# size_t count)
#
# The two labels below are aliases for the same address, so
# _Copy_arrayof_conjoint_jlongs shares this entry point.
# No copy is performed: the shared entry consists solely of `hlt 1009`.
# Presumably a not-yet-implemented placeholder (see the TODO-AARCH64 marker
# near the top of this file).
# There is no `ret` after the hlt, so if execution were ever resumed past it,
# control would fall through into the code that follows.
_Copy_conjoint_jlongs_atomic:
_Copy_arrayof_conjoint_jlongs:
hlt 1009
# Support for void Copy::conjoint_jints_atomic(void* from,
# void* to,
# size_t count)
#
# The two labels below are aliases for the same address, so
# _Copy_arrayof_conjoint_jints shares this implementation.
#
# Overlap-safe copy in which every access uses 32-bit (w) registers, either
# singly (ldr/str) or as a pair of 32-bit values (ldp/stp).
# In: x0 = from, x1 = to, x2 = length. x2 is added directly to the pointers,
# so it is used as a BYTE count; only bit 2 is tested for an odd
# trailing unit, so x2 is presumably always a multiple of 4 - confirm
# against the caller.
# Writes: x0-x18 and the condition flags. No stack is used.
# NOTE(review): x18 is used as a scratch register; this assumes a platform
# that does not reserve x18 - confirm.
#
# Structure:
# * forward, x2 < 64: one optional 4-byte copy, then a computed branch
# into a table of ldp/stp pairs (8 bytes each).
# * forward, x2 >= 64: software-pipelined loop moving 32 bytes per
# iteration, remainder handled by the small path.
# * backward (destination starts inside the source range): simple
# descending loop, 8 bytes per iteration.
_Copy_conjoint_jints_atomic:
_Copy_arrayof_conjoint_jints:
# These and further memory prefetches may hit out of array ranges.
# Experiments showed that prefetching of inaccessible memory doesn't result in exceptions.
prfm pldl1keep, [x0, #0]
prfm pstl1keep, [x1, #0]
prfm pldl1keep, [x0, #32]
prfm pstl1keep, [x1, #32]
subs x3, x1, x0
# hi condition is met <=> from < to
ccmp x2, x3, #0, hi
# hi condition is met <=> (from < to) and (to - from < count)
b.hi ci_backward
# Forward copy. x18 = x2 - 64; a non-negative result selects the large path.
subs x18, x2, #64
b.ge ci_forward_large
ci_forward_lt_64:
# Copy [x0, x0 + x2) to [x1, x1 + x2), with x2 < 64.
# Also entered from ci_forward_loop_end with the remainder of a large copy.
adr x15, ci_forward_tail_table_base
and x16, x2, #~4
# Calculate address to jump and store it to x15:
# Each pair of instructions before ci_forward_tail_table_base copies 8 bytes.
# x16 is count of bytes to copy aligned down by 8.
# So x16/8 pairs of instructions should be executed.
# Each pair takes 8 bytes, so x15 = ci_forward_tail_table_base - (x16/8)*8 = x15 - x16
sub x15, x15, x16
# Instruction prefetch hint for the computed branch target.
prfm plil1keep, [x15]
# x17/x18 = one past the end of source/destination; the table below
# addresses its data relative to these end pointers.
add x17, x0, x2
add x18, x1, x2
# If x2 = x16 + 4, then copy 4 bytes and x16 bytes after that.
# Otherwise x2 = x16, so proceed to copy x16 bytes.
tbz x2, #2, ci_forward_lt_64_even
ldr w3, [x0]
str w3, [x1]
ci_forward_lt_64_even:
# Copy [x17 - x16, x17) to [x18 - x16, x18)
# x16 is aligned by 8 and less than 64
# Execute (x16/8) ldp-stp pairs; each pair copies 8 bytes
br x15
# Branch table: entered part-way through, always runs to
# ci_forward_tail_table_base. The final pair reuses w15/w16 as data
# registers, which is safe because the branch target and the aligned length
# are no longer needed at that point.
ldp w3, w4, [x17, #-56]
stp w3, w4, [x18, #-56]
ldp w5, w6, [x17, #-48]
stp w5, w6, [x18, #-48]
ldp w7, w8, [x17, #-40]
stp w7, w8, [x18, #-40]
ldp w9, w10, [x17, #-32]
stp w9, w10, [x18, #-32]
ldp w11, w12, [x17, #-24]
stp w11, w12, [x18, #-24]
ldp w13, w14, [x17, #-16]
stp w13, w14, [x18, #-16]
ldp w15, w16, [x17, #-8]
stp w15, w16, [x18, #-8]
ci_forward_tail_table_base:
ret
# Align to 64 bytes, then pad with 12 nops so that the 4 preload
# instructions at ci_forward_large end exactly on the next 64-byte boundary.
.p2align 6
.rept 12
nop
.endr
ci_forward_large:
# x18 >= 0;
# Copy [x0, x0 + x18 + 64) to [x1, x1 + x18 + 64)
# Preload the first 32 bytes into w3-w10 and advance x0 past them.
ldp w3, w4, [x0], #32
ldp w5, w6, [x0, #-24]
ldp w7, w8, [x0, #-16]
ldp w9, w10, [x0, #-8]
# Before and after each iteration of loop registers w3-w10 contain [x0 - 32, x0),
# and x1 is a place to copy this data;
# x18 contains number of bytes to be stored minus 64
# Exactly 16 instructions from p2align, so ci_forward_loop starts from cache line boundary
# Checking it explicitly by aligning with "hlt 1000" instructions:
# 0xd4407d00 is the fill word; if the count above ever drifts, the padding
# is executed and halts instead of silently running misaligned.
.p2alignl 6, 0xd4407d00
ci_forward_loop:
prfm pldl1keep, [x0, #32]
prfm pstl1keep, [x1, #32]
# Flags set here are consumed by the b.ge at the bottom of the loop; none of
# the stp/ldp/add instructions in between modify them.
subs x18, x18, #32
# Store the 32 bytes held in w3-w10 while reloading the next 32 bytes.
stp w3, w4, [x1, #0]
ldp w3, w4, [x0, #0]
stp w5, w6, [x1, #8]
ldp w5, w6, [x0, #8]
stp w7, w8, [x1, #16]
ldp w7, w8, [x0, #16]
stp w9, w10, [x1, #24]
ldp w9, w10, [x0, #24]
add x1, x1, #32
add x0, x0, #32
b.ge ci_forward_loop
# 14 instructions from ci_forward_loop, so the loop body fits into one cache line
ci_forward_loop_end:
# x2 = bytes still to copy after the 32 bytes stored below; the flags from
# this adds are consumed by the b.ne further down (stp leaves them intact).
adds x2, x18, #32
stp w3, w4, [x1], #32
stp w5, w6, [x1, #-24]
stp w7, w8, [x1, #-16]
stp w9, w10, [x1, #-8]
# Increased x18 by 32, but stored 32 bytes, so x2 contains exact number of bytes to be stored
# If this number is not zero, also copy remaining bytes
b.ne ci_forward_lt_64
ret
ci_backward:
# The overlapping case should be rare, so it is not worth optimizing.
# The Z flag produced by this ands is what the b.eq below tests; the add/sub
# instructions in between do not touch the flags.
ands x3, x2, #~4
# x3 is count aligned down by 2*jintSize
# Move both pointers to one past the end of their ranges.
add x0, x0, x2
add x1, x1, x2
sub x3, x3, #8
# Skip loop if 0 or 1 jints
b.eq ci_backward_loop_end
# x3 >= 0
# Copy [x0 - x3 - 8, x0) to [x1 - x3 - 8, x1) backward
ci_backward_loop:
subs x3, x3, #8
ldp w4, w5, [x0, #-8]!
stp w4, w5, [x1, #-8]!
b.ge ci_backward_loop
ci_backward_loop_end:
# Copy remaining 0 or 1 jints
# After the loop x0/x1 point at the lowest pair copied, so the odd element
# (if bit 2 of the length is set) sits 4 bytes below them.
tbz x2, #2, ci_backward_finish
ldr w3, [x0, #-4]
str w3, [x1, #-4]
ci_backward_finish:
ret