hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_64.s
author vdeshpande
Wed, 07 Jun 2017 13:09:46 -0700
changeset 46528 cf0da758e7b5
parent 22234 da823d78ad65
permissions -rw-r--r--
8181616: FMA Vectorization on x86 Reviewed-by: kvn

/
/ Copyright (c) 2004, 2013, Oracle and/or its affiliates. All rights reserved.
/ DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
/
/ This code is free software; you can redistribute it and/or modify it
/ under the terms of the GNU General Public License version 2 only, as
/ published by the Free Software Foundation.
/
/ This code is distributed in the hope that it will be useful, but WITHOUT
/ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
/ FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
/ version 2 for more details (a copy is included in the LICENSE file that
/ accompanied this code).
/
/ You should have received a copy of the GNU General Public License version
/ 2 along with this work; if not, write to the Free Software Foundation,
/ Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
/
/ Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
/ or visit www.oracle.com if you need additional information or have any
/ questions.
/

        / Exported entry points.  fs_load and fs_thread are defined first
        / in the text section below; SpinPause carries its own .globl next
        / to its definition.
        .globl fs_load
        .globl fs_thread

        // NOTE WELL!  The _Copy functions are called directly
        // from server-compiler-generated code via CallLeafNoFP,
        // which means that they *must* either not use floating
        // point or use it in the same manner as does the server
        // compiler.

        / The jshorts, jints and jlongs routines each export two names
        / (an "_atomic" and an "arrayof" one) that label the same address.
        .globl _Copy_arrayof_conjoint_bytes
        .globl _Copy_conjoint_jshorts_atomic
        .globl _Copy_arrayof_conjoint_jshorts
        .globl _Copy_conjoint_jints_atomic
        .globl _Copy_arrayof_conjoint_jints
        .globl _Copy_conjoint_jlongs_atomic
        .globl _Copy_arrayof_conjoint_jlongs

        / Everything that follows is emitted into an allocatable,
        / executable text section.
        .section .text,"ax"

        / Fast thread accessors, used by threadLS_solaris_amd64.cpp
        / fs_load
        /   rdi - offset, applied relative to the %fs segment base
        /   rax - on return, the 8-byte value read from %fs:(offset)
        / Leaf routine: one load, no stack use, no register other than
        / rax is written and the flags are left alone.
        .align  16
fs_load:
        movq    %fs:(%rdi), %rax      / rax = qword at %fs:[rdi]
        ret

        / fs_thread
        /   takes no arguments
        /   rax - on return, the 8-byte value stored at offset 0 of the
        /         %fs segment
        / Leaf routine: one load, no stack use, only rax is written.
        / NOTE(review): presumably that slot holds the current thread
        / pointer on this platform - confirm against the C++ caller.
        .align  16
fs_thread:
        movq    %fs:0, %rax           / rax = qword at %fs:[0]
        ret

        / SpinPause
        /   takes no arguments
        /   rax - always 1 on return
        / Executes one spin-wait hint and returns.  The "rep" prefix
        / followed by "nop" assembles to the bytes F3 90, which is the
        / encoding of the PAUSE instruction; presumably it is spelled this
        / way so that assemblers lacking a "pause" mnemonic still accept it.
        / Leaf routine: no stack use, only rax is written.
        .globl  SpinPause
        .align  16
SpinPause:
        rep
        nop                           / rep + nop == pause
        movl    $1, %eax              / 32-bit write zero-extends: rax = 1
        ret


        / Support for void Copy::arrayof_conjoint_bytes(void* from,
        /                                               void* to,
        /                                               size_t count)
        / rdi - from
        / rsi - to
        / rdx - count, treated as ssize_t
        /
        / Copies 'count' bytes; the source and destination ranges may
        / overlap.  The bulk is moved as 8-byte qwords (count >> 3 of them,
        / four per pass in the unrolled loops) and the remaining count & 7
        / bytes as at most one dword, one word and one byte.
        /
        / Direction is chosen at entry with unsigned compares:
        /   to <= from                  -> acb_CopyRight (ascending)
        /   from < to <= from+count-1   -> acb_CopyLeft  (descending)
        /   otherwise                   -> acb_CopyRight (ascending)
        /
        / Registers written: rax, rcx, rdx, rsi, r8 and the flags.
        / rdi is never modified.  No stack is used.
        /
        .align   16
_Copy_arrayof_conjoint_bytes:
        movq     %rdx,%r8             / byte count
        shrq     $3,%rdx              / qword count
        cmpq     %rdi,%rsi            / flags <- to - from (leaq keeps them)
        leaq     -1(%rdi,%r8,1),%rax  / from + bcount*1 - 1
        jbe      acb_CopyRight        / to <= from
        cmpq     %rax,%rsi
        jbe      acb_CopyLeft         / to lies inside the source range
acb_CopyRight:
        / Ascending copy.  rax/rcx point at the last whole qword of the
        / source/destination and rdx runs from -qcount up to 0, so
        / 8(base,rdx,8) walks from the first qword to the last.
        leaq     -8(%rdi,%rdx,8),%rax / from + qcount*8 - 8
        leaq     -8(%rsi,%rdx,8),%rcx / to + qcount*8 - 8
        negq     %rdx
        jmp      7f                   / enter at the unrolled-loop test
        .align   16
        / 1: one qword per pass, for the 1..3 qwords left after label 6.
1:      movq     8(%rax,%rdx,8),%rsi
        movq     %rsi,8(%rcx,%rdx,8)
        addq     $1,%rdx
        jnz      1b
        / 2: tail.  8(%rax) is from + qcount*8, the first byte after the
        / whole qwords.
2:      testq    $4,%r8               / check for trailing dword
        jz       3f
        movl     8(%rax),%esi         / copy trailing dword
        movl     %esi,8(%rcx)
        addq     $4,%rax              / step the source cursor past it too
        addq     $4,%rcx              / original %rsi is trashed, so we
                                      /  can't use it as a base register
3:      testq    $2,%r8               / check for trailing word
        jz       4f
        movw     8(%rax),%si          / copy trailing word
        movw     %si,8(%rcx)
        addq     $2,%rcx              / only the destination cursor moves:
                                      /  the byte below is read via rdi/r8
4:      testq    $1,%r8               / check for trailing byte
        jz       5f
        movb     -1(%rdi,%r8,1),%al   / copy trailing byte
        movb     %al,8(%rcx)
5:      ret
        .align   16
        / 6: unrolled body, four qwords per pass in ascending order.
6:      movq     -24(%rax,%rdx,8),%rsi
        movq     %rsi,-24(%rcx,%rdx,8)
        movq     -16(%rax,%rdx,8),%rsi
        movq     %rsi,-16(%rcx,%rdx,8)
        movq     -8(%rax,%rdx,8),%rsi
        movq     %rsi,-8(%rcx,%rdx,8)
        movq     (%rax,%rdx,8),%rsi
        movq     %rsi,(%rcx,%rdx,8)
        / 7: loop entry/test.  rdx is minus the number of qwords left;
        / after adding 4 it is still <= 0 while at least four remain.
7:      addq     $4,%rdx
        jle      6b
        subq     $4,%rdx              / undo the bias: rdx = -(0..3)
        jl       1b                   / some qwords left: finish singly
        jmp      2b                   / none left: go handle the tail
acb_CopyLeft:
        / Descending copy, highest addresses first: the sub-qword tail is
        / moved before the qwords.
        testq    $1,%r8               / check for trailing byte
        jz       1f
        movb     -1(%rdi,%r8,1),%cl   / copy trailing byte
        movb     %cl,-1(%rsi,%r8,1)
        subq     $1,%r8               / adjust for possible trailing word
1:      testq    $2,%r8               / check for trailing word
        jz       2f
        movw     -2(%rdi,%r8,1),%cx   / copy trailing word
        movw     %cx,-2(%rsi,%r8,1)
2:      testq    $4,%r8               / check for trailing dword
        jz       5f
        movl     (%rdi,%rdx,8),%ecx   / copy trailing dword
        movl     %ecx,(%rsi,%rdx,8)
        jmp      5f                   / enter at the unrolled-loop test
        .align   16
        / 3: one qword per pass, rdx counts the remaining qwords down to 0.
3:      movq     -8(%rdi,%rdx,8),%rcx
        movq     %rcx,-8(%rsi,%rdx,8)
        subq     $1,%rdx
        jnz      3b
        ret
        .align   16
        / 4: unrolled body, four qwords per pass in descending order.
4:      movq     24(%rdi,%rdx,8),%rcx
        movq     %rcx,24(%rsi,%rdx,8)
        movq     16(%rdi,%rdx,8),%rcx
        movq     %rcx,16(%rsi,%rdx,8)
        movq     8(%rdi,%rdx,8),%rcx
        movq     %rcx,8(%rsi,%rdx,8)
        movq     (%rdi,%rdx,8),%rcx
        movq     %rcx,(%rsi,%rdx,8)
        / 5: loop entry/test.  rdx is the number of qwords left; after
        / subtracting 4 it is still >= 0 while at least four remained.
5:      subq     $4,%rdx
        jge      4b
        addq     $4,%rdx              / undo the bias: rdx = 0..3
        jg       3b                   / some qwords left: finish singly
        ret

        / Support for void Copy::arrayof_conjoint_jshorts(void* from,
        /                                                 void* to,
        /                                                 size_t count)
        / Equivalent to
        /   conjoint_jshorts_atomic
        /
        / If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
        / let the hardware handle it.  The two or four words within dwords
        / or qwords that span cache line boundaries will still be loaded
        / and stored atomically.
        /
        / rdi - from
        / rsi - to
        / rdx - count, treated as ssize_t
        /
        / 'count' is in 2-byte words.  count >> 2 qwords are moved (four
        / per pass in the unrolled loops), then at most one dword and one
        / word.  Both labels below name the same entry point.
        /
        / Direction is chosen at entry with unsigned compares:
        /   to <= from                      -> acs_CopyRight (ascending)
        /   from < to <= from+count*2-2     -> acs_CopyLeft  (descending)
        /   otherwise                       -> acs_CopyRight (ascending)
        /
        / Registers written: rax, rcx, rdx, rsi, r8 and the flags.
        / rdi is never modified.  No stack is used.
        /
        .align   16
_Copy_arrayof_conjoint_jshorts:
_Copy_conjoint_jshorts_atomic:
        movq     %rdx,%r8             / word count
        shrq     $2,%rdx              / qword count
        cmpq     %rdi,%rsi            / flags <- to - from (leaq keeps them)
        leaq     -2(%rdi,%r8,2),%rax  / from + wcount*2 - 2
        jbe      acs_CopyRight        / to <= from
        cmpq     %rax,%rsi
        jbe      acs_CopyLeft         / to lies inside the source range
acs_CopyRight:
        / Ascending copy.  rax/rcx point at the last whole qword and rdx
        / runs from -qcount up to 0.
        leaq     -8(%rdi,%rdx,8),%rax / from + qcount*8 - 8
        leaq     -8(%rsi,%rdx,8),%rcx / to + qcount*8 - 8
        negq     %rdx
        jmp      6f                   / enter at the unrolled-loop test
        / 1: one qword per pass, for the 1..3 qwords left after label 5.
1:      movq     8(%rax,%rdx,8),%rsi
        movq     %rsi,8(%rcx,%rdx,8)
        addq     $1,%rdx
        jnz      1b
        / 2: tail.  8(%rax) is from + qcount*8.
2:      testq    $2,%r8               / check for trailing dword
        jz       3f
        movl     8(%rax),%esi         / copy trailing dword
        movl     %esi,8(%rcx)
        addq     $4,%rcx              / original %rsi is trashed, so we
                                      /  can't use it as a base register
3:      testq    $1,%r8               / check for trailing word
        jz       4f
        movw     -2(%rdi,%r8,2),%si   / copy trailing word
        movw     %si,8(%rcx)
4:      ret
        .align   16
        / 5: unrolled body, four qwords per pass in ascending order.
5:      movq     -24(%rax,%rdx,8),%rsi
        movq     %rsi,-24(%rcx,%rdx,8)
        movq     -16(%rax,%rdx,8),%rsi
        movq     %rsi,-16(%rcx,%rdx,8)
        movq     -8(%rax,%rdx,8),%rsi
        movq     %rsi,-8(%rcx,%rdx,8)
        movq     (%rax,%rdx,8),%rsi
        movq     %rsi,(%rcx,%rdx,8)
        / 6: loop entry/test.  rdx is minus the number of qwords left;
        / after adding 4 it is still <= 0 while at least four remain.
6:      addq     $4,%rdx
        jle      5b
        subq     $4,%rdx              / undo the bias: rdx = -(0..3)
        jl       1b                   / some qwords left: finish singly
        jmp      2b                   / none left: go handle the tail
acs_CopyLeft:
        / Descending copy: the sub-qword tail is moved before the qwords.
        testq    $1,%r8               / check for trailing word
        jz       1f
        movw     -2(%rdi,%r8,2),%cx   / copy trailing word
        movw     %cx,-2(%rsi,%r8,2)
1:      testq    $2,%r8               / check for trailing dword
        jz       4f
        movl     (%rdi,%rdx,8),%ecx   / copy trailing dword
        movl     %ecx,(%rsi,%rdx,8)
        jmp      4f                   / enter at the unrolled-loop test
        / 2: one qword per pass, rdx counts the remaining qwords down to 0.
2:      movq     -8(%rdi,%rdx,8),%rcx
        movq     %rcx,-8(%rsi,%rdx,8)
        subq     $1,%rdx
        jnz      2b
        ret
        .align   16
        / 3: unrolled body, four qwords per pass in descending order.
3:      movq     24(%rdi,%rdx,8),%rcx
        movq     %rcx,24(%rsi,%rdx,8)
        movq     16(%rdi,%rdx,8),%rcx
        movq     %rcx,16(%rsi,%rdx,8)
        movq     8(%rdi,%rdx,8),%rcx
        movq     %rcx,8(%rsi,%rdx,8)
        movq     (%rdi,%rdx,8),%rcx
        movq     %rcx,(%rsi,%rdx,8)
        / 4: loop entry/test.  rdx is the number of qwords left; after
        / subtracting 4 it is still >= 0 while at least four remained.
4:      subq     $4,%rdx
        jge      3b
        addq     $4,%rdx              / undo the bias: rdx = 0..3
        jg       2b                   / some qwords left: finish singly
        ret

        / Support for void Copy::arrayof_conjoint_jints(jint* from,
        /                                               jint* to,
        /                                               size_t count)
        / Equivalent to
        /   conjoint_jints_atomic
        /
        / If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
        / the hardware handle it.  The two dwords within qwords that span
        / cache line boundaries will still be loaded and stored atomically.
        /
        / rdi - from
        / rsi - to
        / rdx - count, treated as ssize_t
        /
        / 'count' is in 4-byte dwords.  count >> 1 qwords are moved (four
        / per pass in the unrolled loops), then at most one dword.  Both
        / labels below name the same entry point.
        /
        / Direction is chosen at entry with unsigned compares:
        /   to <= from                      -> aci_CopyRight (ascending)
        /   from < to <= from+count*4-4     -> aci_CopyLeft  (descending)
        /   otherwise                       -> aci_CopyRight (ascending)
        /
        / Registers written: rax, rcx, rdx, rsi, r8 and the flags.
        / rdi is never modified.  No stack is used.
        /
        .align   16
_Copy_arrayof_conjoint_jints:
_Copy_conjoint_jints_atomic:
        movq     %rdx,%r8             / dword count
        shrq     %rdx                 / qword count
        cmpq     %rdi,%rsi            / flags <- to - from (leaq keeps them)
        leaq     -4(%rdi,%r8,4),%rax  / from + dcount*4 - 4
        jbe      aci_CopyRight        / to <= from
        cmpq     %rax,%rsi
        jbe      aci_CopyLeft         / to lies inside the source range
aci_CopyRight:
        / Ascending copy.  rax/rcx point at the last whole qword and rdx
        / runs from -qcount up to 0.
        leaq     -8(%rdi,%rdx,8),%rax / from + qcount*8 - 8
        leaq     -8(%rsi,%rdx,8),%rcx / to + qcount*8 - 8
        negq     %rdx
        jmp      5f                   / enter at the unrolled-loop test
        .align   16
        / 1: one qword per pass, for the 1..3 qwords left after label 4.
1:      movq     8(%rax,%rdx,8),%rsi
        movq     %rsi,8(%rcx,%rdx,8)
        addq     $1,%rdx
        jnz       1b
        / 2: tail.  8(%rax) is from + qcount*8.
2:      testq    $1,%r8               / check for trailing dword
        jz       3f
        movl     8(%rax),%esi         / copy trailing dword
        movl     %esi,8(%rcx)
3:      ret
        .align   16
        / 4: unrolled body, four qwords per pass in ascending order.
4:      movq     -24(%rax,%rdx,8),%rsi
        movq     %rsi,-24(%rcx,%rdx,8)
        movq     -16(%rax,%rdx,8),%rsi
        movq     %rsi,-16(%rcx,%rdx,8)
        movq     -8(%rax,%rdx,8),%rsi
        movq     %rsi,-8(%rcx,%rdx,8)
        movq     (%rax,%rdx,8),%rsi
        movq     %rsi,(%rcx,%rdx,8)
        / 5: loop entry/test.  rdx is minus the number of qwords left;
        / after adding 4 it is still <= 0 while at least four remain.
5:      addq     $4,%rdx
        jle      4b
        subq     $4,%rdx              / undo the bias: rdx = -(0..3)
        jl       1b                   / some qwords left: finish singly
        jmp      2b                   / none left: go handle the tail
aci_CopyLeft:
        / Descending copy: the odd trailing dword is moved before the
        / qwords.
        testq    $1,%r8               / check for trailing dword
        jz       3f
        movl     -4(%rdi,%r8,4),%ecx  / copy trailing dword
        movl     %ecx,-4(%rsi,%r8,4)
        jmp      3f                   / enter at the unrolled-loop test
        / 1: one qword per pass, rdx counts the remaining qwords down to 0.
1:      movq     -8(%rdi,%rdx,8),%rcx
        movq     %rcx,-8(%rsi,%rdx,8)
        subq     $1,%rdx
        jnz      1b
        ret
        .align   16
        / 2: unrolled body, four qwords per pass in descending order.
2:      movq     24(%rdi,%rdx,8),%rcx
        movq     %rcx,24(%rsi,%rdx,8)
        movq     16(%rdi,%rdx,8),%rcx
        movq     %rcx,16(%rsi,%rdx,8)
        movq     8(%rdi,%rdx,8),%rcx
        movq     %rcx,8(%rsi,%rdx,8)
        movq     (%rdi,%rdx,8),%rcx
        movq     %rcx,(%rsi,%rdx,8)
        / 3: loop entry/test.  rdx is the number of qwords left; after
        / subtracting 4 it is still >= 0 while at least four remained.
3:      subq     $4,%rdx
        jge      2b
        addq     $4,%rdx              / undo the bias: rdx = 0..3
        jg       1b                   / some qwords left: finish singly
        ret

        / Support for void Copy::arrayof_conjoint_jlongs(jlong* from,
        /                                                jlong* to,
        /                                                size_t count)
        / Equivalent to
        /   conjoint_jlongs_atomic
        /   arrayof_conjoint_oops
        /   conjoint_oops_atomic
        /
        / rdi - from
        / rsi - to
        / rdx - count, treated as ssize_t
        /
        / 'count' is in 8-byte qwords, so there is no sub-qword tail: every
        / element is moved by one movq load and one movq store, four per
        / pass in the unrolled loops.  Both labels below name the same
        / entry point.
        /
        / Direction is chosen at entry with unsigned compares:
        /   to <= from                      -> acl_CopyRight (ascending)
        /   from < to <= from+count*8-8     -> acl_CopyLeft  (descending)
        /   otherwise                       -> acl_CopyRight (ascending)
        /
        / Registers written: rax, rcx, rdx, rsi and the flags.
        / rdi is never modified.  No stack is used.
        /
        .align   16
_Copy_arrayof_conjoint_jlongs:
_Copy_conjoint_jlongs_atomic:
        cmpq     %rdi,%rsi            / flags <- to - from (leaq keeps them)
        leaq     -8(%rdi,%rdx,8),%rax / from + count*8 - 8
        jbe      acl_CopyRight        / to <= from
        cmpq     %rax,%rsi
        jbe      acl_CopyLeft         / to lies inside the source range
acl_CopyRight:
        / Ascending copy.  rax (computed above) and rcx point at the last
        / source/destination qword; rdx runs from -count up to 0.
        leaq     -8(%rsi,%rdx,8),%rcx / to + count*8 - 8
        negq     %rdx
        jmp      3f                   / enter at the unrolled-loop test
        / 1: one qword per pass, for the 1..3 qwords left after label 2.
1:      movq     8(%rax,%rdx,8),%rsi
        movq     %rsi,8(%rcx,%rdx,8)
        addq     $1,%rdx
        jnz      1b
        ret
        .align   16
        / 2: unrolled body, four qwords per pass in ascending order.
2:      movq     -24(%rax,%rdx,8),%rsi
        movq     %rsi,-24(%rcx,%rdx,8)
        movq     -16(%rax,%rdx,8),%rsi
        movq     %rsi,-16(%rcx,%rdx,8)
        movq     -8(%rax,%rdx,8),%rsi
        movq     %rsi,-8(%rcx,%rdx,8)
        movq     (%rax,%rdx,8),%rsi
        movq     %rsi,(%rcx,%rdx,8)
        / 3: loop entry/test.  rdx is minus the number of qwords left;
        / after adding 4 it is still <= 0 while at least four remain.
3:      addq     $4,%rdx
        jle      2b
        subq     $4,%rdx              / undo the bias: rdx = -(0..3)
        jl       1b                   / some qwords left: finish singly
        ret
        / 4: descending, one qword per pass; rdx counts down to 0.
4:      movq     -8(%rdi,%rdx,8),%rcx
        movq     %rcx,-8(%rsi,%rdx,8)
        subq     $1,%rdx
        jnz      4b
        ret
        .align   16
        / 5: descending unrolled body, four qwords per pass.
5:      movq     24(%rdi,%rdx,8),%rcx
        movq     %rcx,24(%rsi,%rdx,8)
        movq     16(%rdi,%rdx,8),%rcx
        movq     %rcx,16(%rsi,%rdx,8)
        movq     8(%rdi,%rdx,8),%rcx
        movq     %rcx,8(%rsi,%rdx,8)
        movq     (%rdi,%rdx,8),%rcx
        movq     %rcx,(%rsi,%rdx,8)
        / Descending entry point.  The label sits on the loop test itself,
        / after the loop bodies, so the first thing done is the check
        / whether at least four qwords remain (rdx = qwords left).
acl_CopyLeft:
        subq     $4,%rdx
        jge      5b
        addq     $4,%rdx              / undo the bias: rdx = 0..3
        jg       4b                   / some qwords left: finish singly
        ret