hotspot/src/os_cpu/linux_arm/vm/linux_arm_32.s
author vdeshpande
Wed, 07 Jun 2017 13:09:46 -0700
changeset 46528 cf0da758e7b5
parent 42664 29142a56c193
permissions -rw-r--r--
8181616: FMA Vectorization on x86 Reviewed-by: kvn

# 
# Copyright (c) 2008, 2013, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
# 

	
        # NOTE WELL!  The _Copy functions are called directly
	# from server-compiler-generated code via CallLeafNoFP,
	# which means that they *must* either not use floating
	# point or use it in the same manner as does the server
	# compiler.
	
        # Exported copy entry points.  Every symbol is made global and is
        # typed as a function in the object file's symbol table.

        # Byte copies
        .globl  _Copy_conjoint_bytes
        .type   _Copy_conjoint_bytes, %function
        .globl  _Copy_arrayof_conjoint_bytes
        .type   _Copy_arrayof_conjoint_bytes, %function

        # Word copies
        .globl  _Copy_disjoint_words
        .type   _Copy_disjoint_words, %function
        .globl  _Copy_conjoint_words
        .type   _Copy_conjoint_words, %function

        # jshort copies
        .globl  _Copy_conjoint_jshorts_atomic
        .type   _Copy_conjoint_jshorts_atomic, %function
        .globl  _Copy_arrayof_conjoint_jshorts
        .type   _Copy_arrayof_conjoint_jshorts, %function

        # jint copies
        .globl  _Copy_conjoint_jints_atomic
        .type   _Copy_conjoint_jints_atomic, %function
        .globl  _Copy_arrayof_conjoint_jints
        .type   _Copy_arrayof_conjoint_jints, %function

        # jlong copies
        .globl  _Copy_conjoint_jlongs_atomic
        .type   _Copy_conjoint_jlongs_atomic, %function
        .globl  _Copy_arrayof_conjoint_jlongs
        .type   _Copy_arrayof_conjoint_jlongs, %function

	.text

        # SpinPause performs no work: it branches straight back to the
        # caller through the link register and leaves every other register,
        # r0 included, exactly as it was on entry.
        .type   SpinPause, %function
        .globl  SpinPause
SpinPause:
        bx      lr

        # Support for void Copy::conjoint_bytes(void* from,
        #                                       void* to,
        #                                       size_t count)
        #
        # No copy loop is provided for this entry point: its whole body is
        # a single software interrupt.
        # NOTE(review): 0x9f0001 looks like the OABI encoding of the
        # ARM-private breakpoint call (0x900000 + 0x0f0001) - confirm against
        # the kernel headers.  There is no return instruction, so if the
        # interrupt ever returns, execution continues into whatever code
        # follows this label in the file.
_Copy_conjoint_bytes:
        swi     0x9f0001

        # Support for void Copy::arrayof_conjoint_bytes(void* from,
        #                                               void* to,
        #                                               size_t count)
        #
        # No copy loop is provided for this entry point: its whole body is
        # a single software interrupt.
        # NOTE(review): 0x9f0001 looks like the OABI encoding of the
        # ARM-private breakpoint call (0x900000 + 0x0f0001) - confirm against
        # the kernel headers.  There is no return instruction, so if the
        # interrupt ever returns, execution continues into whatever code
        # follows this label in the file.
_Copy_arrayof_conjoint_bytes:
        swi     0x9f0001


        # Support for void Copy::disjoint_words(void* from,
        #                                       void* to,
        #                                       size_t count)
        #
        # Copies in ascending address order only; no overlap test is made.
        #
        # Register usage, as the code below uses them:
        #   r0        - store pointer (destination), post-incremented
        #   r1        - load pointer (source), post-incremented
        #   r2        - remaining length, consumed in bytes
        #   r3-r9, ip - scratch; pushed on entry and popped before returning
        #
        # NOTE(review): data is loaded through r1 and stored through r0, so
        # the register order is (dest, src, byte count) rather than the
        # (from, to, count) order of the C++ prototype quoted above - confirm
        # against the caller.
        # The tail code only moves whole 4-byte words, so r2 is presumably
        # always a multiple of 4 - TODO confirm.
_Copy_disjoint_words:
        stmdb    sp!, {r3 - r9, ip}
 
        # Nothing to copy.
        cmp     r2, #0
        beq     disjoint_words_finish

        # 12 bytes or fewer (at most three words) are handled entirely by
        # the tail code.
        pld     [r1, #0]
        cmp     r2, #12
        ble disjoint_words_small

        # Main loop: 32 bytes (eight registers) per pass.  ldm, pld and stm
        # leave the condition flags alone, so the bgt at the bottom still
        # tests the result of the subs at the top.
        .align 3
dw_f2b_loop_32:
        subs    r2, #32
	blt	dw_f2b_loop_32_finish
        ldmia r1!, {r3 - r9, ip}
        nop
	pld     [r1]
        stmia r0!, {r3 - r9, ip}
        bgt     dw_f2b_loop_32
        # Reached either with r2 == 0 (flags still EQ from the subs) or with
        # r2 negative.  addlts undoes the last subtraction only in the
        # negative case, leaving the 1..31 bytes that remain.
dw_f2b_loop_32_finish:
        addlts  r2, #32
        beq     disjoint_words_finish
        # 16 or more bytes left: move four words.  subge does not update the
        # flags, so the beq below still tests cmp r2, #16 and exits when
        # exactly 16 bytes were left.
        cmp     r2, #16
	blt	disjoint_words_small
        ldmia r1!, {r3 - r6}
        subge   r2, r2, #16
        stmia r0!, {r3 - r6}
        beq     disjoint_words_finish
        # Tail: one to three words.  The first word is moved unconditionally,
        # the second when r2 >= 8 and the third when r2 > 8.
disjoint_words_small:
        cmp     r2, #8
        ldr     r7, [r1], #4
        ldrge   r8, [r1], #4
        ldrgt   r9, [r1], #4
        str     r7, [r0], #4
        strge   r8, [r0], #4
        strgt   r9, [r0], #4

        # Restore the scratch registers saved on entry and return.
disjoint_words_finish:
        ldmia   sp!, {r3 - r9, ip}
        bx      lr


        # Support for void Copy::conjoint_words(void* from,
        #                                       void* to,
        #                                       size_t count)
        #
        # Word copy that tolerates overlapping regions: it copies in
        # ascending order unless that would overwrite source data that has
        # not been read yet, in which case it copies in descending order.
        #
        # Register usage, as the code below uses them:
        #   r0        - store pointer (destination)
        #   r1        - load pointer (source)
        #   r2        - remaining length, consumed in bytes
        #   r3-r9, ip - scratch; pushed on entry and popped before returning
        #
        # NOTE(review): data is loaded through r1 and stored through r0, so
        # the register order is (dest, src, byte count) rather than the
        # (from, to, count) order of the C++ prototype quoted above - confirm
        # against the caller.
        # The tail code only moves whole 4-byte words, so r2 is presumably
        # always a multiple of 4 - TODO confirm.
_Copy_conjoint_words:
        stmdb    sp!, {r3 - r9, ip}

        # Nothing to copy.
	cmp	r2, #0
	beq	conjoint_words_finish

        # 12 bytes or fewer skip the overlap test: the tail code loads every
        # word before it stores any, so the direction does not matter.
        pld     [r1, #0]
        cmp     r2, #12
        ble conjoint_words_small

        # r3 = dest - src.  Copy downwards only when dest is above src
        # (unsigned HI) and the length exceeds that distance.  When the first
        # comparison is not HI, cmphi is skipped and bhi is not taken.
        subs    r3, r0, r1
        cmphi   r2, r3
        bhi     cw_b2f_copy
        # Ascending main loop: 32 bytes per pass.  ldm, pld and stm leave the
        # flags alone, so bgt tests the subs at the top of the loop.
        .align 3
cw_f2b_loop_32:
        subs    r2, #32
	blt	cw_f2b_loop_32_finish
        ldmia r1!, {r3 - r9, ip}
        nop
	pld     [r1]
        stmia r0!, {r3 - r9, ip}
        bgt     cw_f2b_loop_32
        # r2 is 0 (flags EQ) or negative here; addlts restores the remainder
        # in the negative case only.
cw_f2b_loop_32_finish:
        addlts  r2, #32
        beq     conjoint_words_finish
        # 16 or more bytes left: move four words.  subge leaves the flags
        # from cmp r2, #16 intact for the beq that follows.
        cmp     r2, #16
	blt	conjoint_words_small
        ldmia r1!, {r3 - r6}
        subge   r2, r2, #16
        stmia r0!, {r3 - r6}
        beq     conjoint_words_finish
        # Ascending tail: one to three words (second if r2 >= 8, third if
        # r2 > 8).  All loads are issued before the first store.
conjoint_words_small:
        cmp     r2, #8
        ldr     r7, [r1], #4
        ldrge   r8, [r1], #4
        ldrgt   r9, [r1], #4
        str     r7, [r0], #4
        strge   r8, [r0], #4
        strgt   r9, [r0], #4
        b       conjoint_words_finish

	# Src and dest overlap, copy in a descending order
        # Both pointers are first moved to one past the end of their region.
cw_b2f_copy:
        add     r1, r2
        pld     [r1, #-32]
        add     r0, r2
        # Descending main loop: 32 bytes per pass, using decrement-before
        # addressing.  The prefetch targets the block below the one just read.
        .align 3
cw_b2f_loop_32:
        subs    r2, #32
	blt	cw_b2f_loop_32_finish
        ldmdb r1!, {r3-r9,ip}
        nop
	pld     [r1, #-32]
        stmdb r0!, {r3-r9,ip}
        bgt     cw_b2f_loop_32
        # Same remainder handling as the ascending path, mirrored.
cw_b2f_loop_32_finish:
        addlts  r2, #32
        beq     conjoint_words_finish
        cmp     r2, #16
	blt	cw_b2f_copy_small
        ldmdb r1!, {r3 - r6}
        subge   r2, r2, #16
        stmdb r0!, {r3 - r6}
        beq     conjoint_words_finish
        # Descending tail: one to three words, pre-decrementing both
        # pointers; all loads are issued before the first store.
cw_b2f_copy_small:
        cmp     r2, #8
        ldr     r7, [r1, #-4]!
        ldrge   r8, [r1, #-4]!
        ldrgt   r9, [r1, #-4]!
        str     r7, [r0, #-4]!
        strge   r8, [r0, #-4]!
        strgt   r9, [r0, #-4]!

        # Restore the scratch registers saved on entry and return.
conjoint_words_finish:
        ldmia   sp!, {r3 - r9, ip}
        bx      lr

        # Support for void Copy::conjoint_jshorts_atomic(void* from,
        #                                                void* to,
        #                                                size_t count)
        #
        # Copy of 2-byte elements that tolerates overlapping regions and
        # pointers that are not word aligned.  It copies in ascending order
        # unless that would overwrite unread source data, in which case it
        # copies in descending order.
        #
        # Register usage, as the code below uses them:
        #   r0        - store pointer (destination)
        #   r1        - load pointer (source)
        #   r2        - remaining length, consumed in bytes
        #   r3-r9, ip - scratch; pushed on entry and popped before returning
        #
        # NOTE(review): data is loaded through r1 and stored through r0, so
        # the register order is (dest, src, byte count) rather than the
        # (from, to, count) order of the C++ prototype quoted above - confirm
        # against the caller.
        # The code presumably relies on both pointers being at least 2-byte
        # aligned and on r2 being even - TODO confirm.
        # The halfword merging in the misaligned-source loops pairs register
        # halves in a way that matches a little-endian memory layout - TODO
        # confirm that big-endian is not a supported configuration.
_Copy_conjoint_jshorts_atomic:
        stmdb   sp!, {r3 - r9, ip}

        # Nothing to copy.
	cmp	r2, #0
	beq	conjoint_shorts_finish	

        # r3 = dest - src.  Copy downwards only when dest is above src
        # (unsigned HI) and the length exceeds that distance.  When the first
        # comparison is not HI, cmphi is skipped and bhi is not taken.
        subs    r3, r0, r1
        cmphi   r2, r3
        bhi     cs_b2f_copy

        pld     [r1]

        # Choose a path from the low two address bits: a misaligned dest is
        # fixed up first, then a misaligned source gets the merging loop.
        ands    r3, r0, #3
        bne     cs_f2b_dest_u
        ands    r3, r1, #3
        bne     cs_f2b_src_u

	# Aligned source address
        # Ascending block loop: 32 bytes per pass.  ldm, pld and stm leave
        # the flags alone, so bgt tests the subs at the top of the loop.
        .align 3
cs_f2b_loop_32:
        subs    r2, #32
	blt	cs_f2b_loop_32_finish
        ldmia r1!, {r3 - r9, ip}
        nop
        pld     [r1]
        stmia r0!, {r3 - r9, ip}
        bgt     cs_f2b_loop_32
        # r2 is 0 (flags EQ) or negative here; addlts restores the remainder
        # in the negative case only.
cs_f2b_loop_32_finish:
        addlts  r2, #32
        beq     conjoint_shorts_finish
        # r6 = number of whole 8-byte groups left.  movs sets Z when there
        # are none, which the beq at the loop head tests on the first pass.
        # NOTE(review): this relies on any padding emitted for .align not
        # changing the flags.
        movs    r6, r2, lsr #3
        .align 3
cs_f2b_8_loop:
        beq     cs_f2b_4
        ldmia   r1!, {r4-r5}
        subs    r6, #1
        stmia   r0!, {r4-r5}
        bgt     cs_f2b_8_loop

        # Fewer than 8 bytes left: one to three halfwords.  The first is
        # moved unconditionally, the second if r2 >= 4, the third if r2 > 4.
cs_f2b_4:
        ands    r2, #7
        beq     conjoint_shorts_finish
        cmp     r2, #4
        ldrh    r3, [r1], #2
        ldrgeh  r4, [r1], #2
        ldrgth  r5, [r1], #2
        strh    r3, [r0], #2
        strgeh  r4, [r0], #2
        strgth  r5, [r0], #2
        b       conjoint_shorts_finish

	# Destination not aligned
        # Move a single halfword, which advances both pointers by 2, and
        # stop if that was the whole copy.
cs_f2b_dest_u:
        ldrh    r3, [r1], #2
        subs    r2, #2
        strh    r3, [r0], #2
        beq     conjoint_shorts_finish

	# Check to see if source is not aligned either
        ands    r3, r1, #3
        beq     cs_f2b_loop_32

        # Source is misaligned.  With fewer than 16 bytes left the merging
        # block loop is skipped.
cs_f2b_src_u:
        cmp     r2, #16
        blt     cs_f2b_8_u

	# Load 2 first bytes to r7 and make src ptr word aligned
        # bic rounds r1 down to a word boundary, so the word loaded into r7
        # starts below the original r1; only its upper half is used.
        bic     r1, #3
        ldr     r7, [r1], #4

	# Destination aligned, source not
        # r8 = number of 16-byte blocks.  Each pass reads four aligned words
        # and writes four aligned words; every output word is the upper half
        # of one input word combined with the lower half of the next.  r7
        # carries the last input word into the following pass.
        mov     r8, r2, lsr #4
        .align 3
cs_f2b_16_u_loop:
        mov     r3, r7, lsr #16
        ldmia   r1!, {r4 - r7}
        orr     r3, r3, r4, lsl #16
        mov     r4, r4, lsr #16
        pld     [r1]
        orr     r4, r4, r5, lsl #16
        mov     r5, r5, lsr #16
        orr     r5, r5, r6, lsl #16
        mov     r6, r6, lsr #16
        orr     r6, r6, r7, lsl #16
        stmia   r0!, {r3 - r6}
        subs    r8, #1
        bgt     cs_f2b_16_u_loop
        # r2 = bytes left over (fewer than 16).  Because of the word carried
        # in r7, r1 is 2 bytes past the next halfword still to be copied;
        # step it back.
        ands    r2, #0xf
        beq     conjoint_shorts_finish
        sub     r1, #2

        # 8 or more bytes left: read halfword + word + halfword and merge
        # them into two words for a single two-register store.
cs_f2b_8_u:
        cmp     r2, #8
        blt     cs_f2b_4_u
        ldrh    r4, [r1], #2
        ldr     r5, [r1], #4
        ldrh    r6, [r1], #2
        orr     r4, r4, r5, lsl #16
        mov     r5, r5, lsr #16
        orr     r5, r5, r6, lsl #16
        subs    r2, #8
        stmia	r0!, {r4 - r5}
        # On fall-through the beq tests the subs above (stmia leaves the
        # flags alone).  When entered through the blt above, the flags come
        # from cmp r2, #8 with r2 < 8 and are therefore not EQ.
        # Then one to three halfwords are moved as in cs_f2b_4.
cs_f2b_4_u:
        beq     conjoint_shorts_finish
        cmp     r2, #4
        ldrh    r3, [r1], #2
        ldrgeh  r4, [r1], #2
        ldrgth  r5, [r1], #2
        strh    r3, [r0], #2
        strgeh  r4, [r0], #2
        strgth  r5, [r0], #2
        b       conjoint_shorts_finish

	# Src and dest overlap, copy in a descending order
        # Both pointers are first moved to one past the end of their region,
        # then the same alignment dispatch is done on the end addresses.
cs_b2f_copy:
        add     r1, r2
        pld     [r1, #-32]
        add     r0, r2

        ands    r3, r0, #3
        bne     cs_b2f_dest_u
        ands    r3, r1, #3
        bne     cs_b2f_src_u
        # Descending block loop: 32 bytes per pass with decrement-before
        # addressing; bgt tests the subs at the top of the loop.
        .align 3
cs_b2f_loop_32:
        subs    r2, #32
	blt	cs_b2f_loop_32_finish
        ldmdb r1!, {r3-r9,ip}
        nop
        pld     [r1, #-32]
        stmdb r0!, {r3-r9,ip}
        bgt     cs_b2f_loop_32
        # r2 is 0 (flags EQ) or negative here; addlts restores the remainder
        # in the negative case only.
cs_b2f_loop_32_finish:
        addlts  r2, #32
        beq     conjoint_shorts_finish
        # Remainder is peeled off in chunks of 24, 16 and 8 bytes.  Each
        # chunk uses sub, not subs, so the beq after the store still tests
        # the preceding cmp and exits when the chunk was the exact remainder.
        cmp     r2, #24
        blt     cs_b2f_16
        ldmdb   r1!, {r3-r8}
        sub     r2, #24
        stmdb   r0!, {r3-r8}
        beq     conjoint_shorts_finish
cs_b2f_16:
        cmp     r2, #16
        blt     cs_b2f_8
        ldmdb   r1!, {r3-r6}
        sub     r2, #16
        stmdb   r0!, {r3-r6}
        beq     conjoint_shorts_finish
cs_b2f_8:
        cmp     r2, #8
        blt     cs_b2f_all_copy
        ldmdb   r1!, {r3-r4}
        sub     r2, #8
        stmdb   r0!, {r3-r4}
        beq     conjoint_shorts_finish

        # Fewer than 8 bytes left: one to three halfwords, pre-decrementing
        # both pointers (second if r2 >= 4, third if r2 > 4).
cs_b2f_all_copy:
        cmp     r2, #4
        ldrh    r3, [r1, #-2]!
        ldrgeh  r4, [r1, #-2]!
        ldrgth  r5, [r1, #-2]!
        strh    r3, [r0, #-2]!
        strgeh  r4, [r0, #-2]!
        strgth  r5, [r0, #-2]!
        b       conjoint_shorts_finish

	# Destination not aligned
        # Move the topmost halfword, which steps both pointers down by 2.
        # Unlike the ascending counterpart, r2 is not tested for zero here:
        # this path is only entered with r2 greater than the non-zero
        # distance dest - src.
        # NOTE(review): that leaves r2 >= 2 after the subtraction only if the
        # pointers are halfword aligned and r2 is even - confirm.
cs_b2f_dest_u:
        ldrh    r3, [r1, #-2]!
        strh    r3, [r0, #-2]!
        sub     r2, #2
	# Check source alignment as well
        ands    r3, r1, #3
        beq     cs_b2f_loop_32

	# Source not aligned
        # bic rounds r1 down to a word boundary.  In the loop, r3 keeps the
        # word at r1, whose lower half becomes the upper half of the highest
        # output word; ldmdb then reads the four words below it and the
        # halves are merged as in the ascending loop.
cs_b2f_src_u:
        bic     r1, #3
        .align 3
cs_b2f_16_loop_u:
        subs    r2, #16
        blt     cs_b2f_16_loop_u_finished
        ldr     r7, [r1]
        mov     r3, r7
        ldmdb   r1!, {r4 - r7}
        mov     r4, r4, lsr #16
        orr     r4, r4, r5, lsl #16
        pld     [r1, #-32]
        mov     r5, r5, lsr #16
        orr     r5, r5, r6, lsl #16
        mov     r6, r6, lsr #16
        orr     r6, r6, r7, lsl #16
        mov     r7, r7, lsr #16
        orr     r7, r7, r3, lsl #16
        stmdb   r0!, {r4 - r7}
        # Flags are still those of the subs at the loop head.
        bgt     cs_b2f_16_loop_u
        beq     conjoint_shorts_finish
        # r2 was negative: restore the remainder (fewer than 16 bytes).
        # With 10 or more bytes left, 8 bytes are moved with the same merging
        # scheme.  The threshold is 10 rather than 8 because the halfword
        # loop below always runs at least once.
cs_b2f_16_loop_u_finished:
        addlts  r2, #16
        ldr     r3, [r1]
	cmp     r2, #10
        blt     cs_b2f_2_u_loop
        ldmdb   r1!, {r4 - r5}
        mov     r6, r4, lsr #16
        orr     r6, r6, r5, lsl #16
        mov     r7, r5, lsr #16
        orr     r7, r7, r3, lsl #16
        stmdb   r0!, {r6-r7}
        sub     r2, #8
        # One halfword per pass.  The load is post-indexed (r1 already
        # addresses the halfword to move and is stepped down afterwards),
        # the store is pre-indexed.  bgt tests the subs at the top.
	.align 3
cs_b2f_2_u_loop:
        subs    r2, #2
        ldrh    r3, [r1], #-2
        strh    r3, [r0, #-2]!
        bgt     cs_b2f_2_u_loop

        # Restore the scratch registers saved on entry and return.
conjoint_shorts_finish:
        ldmia   sp!, {r3 - r9, ip}
        bx      lr


        # Support for void Copy::arrayof_conjoint_jshorts(void* from,
        #                                                 void* to,
        #                                                 size_t count)
        #
        # No copy loop is provided for this entry point: its whole body is
        # a single software interrupt.
        # NOTE(review): 0x9f0001 looks like the OABI encoding of the
        # ARM-private breakpoint call (0x900000 + 0x0f0001) - confirm against
        # the kernel headers.  There is no return instruction, so if the
        # interrupt ever returns, execution continues into whatever code
        # follows this label in the file.
_Copy_arrayof_conjoint_jshorts:
        swi     0x9f0001

        # Support for void Copy::conjoint_jints_atomic(void* from,
        #                                              void* to,
        #                                              size_t count)
        #
        # Both jint entry points share this address.  No copy loop is
        # provided: the whole body is a single software interrupt.
        # NOTE(review): 0x9f0001 looks like the OABI encoding of the
        # ARM-private breakpoint call (0x900000 + 0x0f0001) - confirm against
        # the kernel headers.  There is no return instruction, so if the
        # interrupt ever returns, execution continues into whatever code
        # follows these labels in the file.
_Copy_conjoint_jints_atomic:
_Copy_arrayof_conjoint_jints:
        swi     0x9f0001
	
        # Support for void Copy::conjoint_jlongs_atomic(jlong* from,
        #                                               jlong* to,
        #                                               size_t count)
        #
        # Copy of 8-byte elements that tolerates overlapping regions: it
        # copies in ascending order unless that would overwrite unread source
        # data, in which case it copies in descending order.  Both jlong
        # entry points share this address.
        #
        # Register usage, as the code below uses them:
        #   r0        - store pointer (destination)
        #   r1        - load pointer (source)
        #   r2        - remaining length, consumed in bytes
        #   r3-r9, ip - scratch; pushed on entry and popped before returning
        #
        # NOTE(review): data is loaded through r1 and stored through r0, so
        # the register order is (dest, src, byte count) rather than the
        # (from, to, count) order of the C++ prototype quoted above - confirm
        # against the caller.
        # The tail code only handles remainders of 8, 16 or 24 bytes, so r2
        # is presumably always a multiple of 8 - TODO confirm.
_Copy_conjoint_jlongs_atomic:
_Copy_arrayof_conjoint_jlongs:
        stmdb    sp!, {r3 - r9, ip}

        # Nothing to copy.
	cmp	r2, #0
	beq	conjoint_longs_finish

        # 24 bytes or fewer skip the overlap test: the tail code loads all of
        # the data before it stores any, so the direction does not matter.
        pld     [r1, #0]
        cmp     r2, #24
        ble conjoint_longs_small

        # r3 = dest - src.  Copy downwards only when dest is above src
        # (unsigned HI) and the length exceeds that distance.  When the first
        # comparison is not HI, cmphi is skipped and bhi is not taken.
        subs    r3, r0, r1
        cmphi   r2, r3
        bhi     cl_b2f_copy
        # Ascending main loop: 32 bytes per pass.  ldm, pld and stm leave the
        # flags alone, so bgt tests the subs at the top of the loop.
        .align 3
cl_f2b_loop_32:
        subs    r2, #32
	blt	cl_f2b_loop_32_finish
        ldmia r1!, {r3 - r9, ip}
        nop
	pld     [r1]
        stmia r0!, {r3 - r9, ip}
        bgt     cl_f2b_loop_32
        # r2 is 0 (flags EQ) or negative here; addlts restores the remainder
        # in the negative case only.
cl_f2b_loop_32_finish:
        addlts  r2, #32
        beq     conjoint_longs_finish
        # Ascending tail, selected by one comparison against 16:
        # below -> 8 bytes, above -> 24 bytes, equal -> 16 bytes.
conjoint_longs_small:
        cmp     r2, #16
	blt	cl_f2b_copy_8
	bgt	cl_f2b_copy_24
        ldmia 	r1!, {r3 - r6}
        stmia 	r0!, {r3 - r6}
	b	conjoint_longs_finish
cl_f2b_copy_8:
        ldmia   r1!, {r3 - r4}
        stmia   r0!, {r3 - r4}
        b       conjoint_longs_finish
cl_f2b_copy_24:
	ldmia   r1!, {r3 - r8}
        stmia   r0!, {r3 - r8}
        b       conjoint_longs_finish

	# Src and dest overlap, copy in a descending order
        # Both pointers are first moved to one past the end of their region.
cl_b2f_copy:
        add     r1, r2
        pld     [r1, #-32]
        add     r0, r2
        # Descending main loop: 32 bytes per pass with decrement-before
        # addressing.  The prefetch hint targets the block below the one just
        # loaded, i.e. the data the next pass will read.
        .align 3
cl_b2f_loop_32:
        subs    r2, #32
	blt	cl_b2f_loop_32_finish
        ldmdb 	r1!, {r3 - r9, ip}
        nop
	pld     [r1, #-32]
        stmdb 	r0!, {r3 - r9, ip}
        bgt     cl_b2f_loop_32
        # Same remainder handling as the ascending path, mirrored.
cl_b2f_loop_32_finish:
        addlts  r2, #32
        beq     conjoint_longs_finish
        cmp     r2, #16
	blt	cl_b2f_copy_8
	bgt	cl_b2f_copy_24
        ldmdb   r1!, {r3 - r6}
        stmdb   r0!, {r3 - r6}
        b       conjoint_longs_finish
cl_b2f_copy_8:
	ldmdb   r1!, {r3 - r4}
        stmdb   r0!, {r3 - r4}
        b       conjoint_longs_finish
cl_b2f_copy_24:
	ldmdb   r1!, {r3 - r8}
        stmdb   r0!, {r3 - r8}

        # Restore the scratch registers saved on entry and return.
conjoint_longs_finish:
        ldmia   sp!, {r3 - r9, ip}
        bx      lr