// hotspot/src/os_cpu/solaris_x86/vm/solaris_x86_32.s
// author vdeshpande
// Wed, 07 Jun 2017 13:09:46 -0700
// changeset 46528 cf0da758e7b5
// parent 46414 5d1d862dc8d8
// permissions -rw-r--r--
// 8181616: FMA Vectorization on x86 Reviewed-by: kvn

//
// Copyright (c) 2004, 2017, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
// under the terms of the GNU General Public License version 2 only, as
// published by the Free Software Foundation.
//
// This code is distributed in the hope that it will be useful, but WITHOUT
// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
// version 2 for more details (a copy is included in the LICENSE file that
// accompanied this code).
//
// You should have received a copy of the GNU General Public License version
// 2 along with this work; if not, write to the Free Software Foundation,
// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
//
// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
// or visit www.oracle.com if you need additional information or have any
// questions.
//

	/ Symbols made global so that code outside this file can reference
	/ them.  This first group covers the small helper routines and the
	/ 64-bit compare-and-exchange; SpinPause is exported next to its
	/ definition further down.
	.globl fixcw
	.globl sse_check
	.globl sse_unavailable
	.globl gs_load
	.globl gs_thread
        .globl _Atomic_cmpxchg_long_gcc

        // NOTE WELL!  The _Copy functions are called directly
	// from server-compiler-generated code via CallLeafNoFP,
	// which means that they *must* either not use floating
	// point or use it in the same manner as does the server
	// compiler.

        / Overlap-safe ("conjoint") copy routines, one per element size,
        / plus an MMX variant of the jshort array copy.
        .globl _Copy_conjoint_bytes
        .globl _Copy_arrayof_conjoint_bytes
        .globl _Copy_conjoint_jshorts_atomic
	.globl _Copy_arrayof_conjoint_jshorts
        .globl _Copy_conjoint_jints_atomic
        .globl _Copy_arrayof_conjoint_jints
	.globl _Copy_conjoint_jlongs_atomic
        .globl _mmx_Copy_arrayof_conjoint_jshorts

	/ Everything below is emitted into .text, flagged allocatable (a)
	/ and executable (x).
	.section .text,"ax"

/ Support for void os::Solaris::init_thread_fpu_state() in os_solaris_i486.cpp
/ Set fpu to 53 bit precision.  This happens too early to use a stub.
/
/ fixcw reads no stack arguments.  It loads the x87 control word with
/ 0x27f: precision control (bits 9:8) = 10b, i.e. a 53-bit mantissa;
/ rounding control (bits 11:10) = 00b, round to nearest; and all six
/ exception mask bits (bits 5:0) set.  fldcw accepts only a memory
/ operand, so the constant goes through a temporary stack slot.
/ Clobbers: %eax (receives the popped constant).
	.align   16
fixcw:
	pushl    $0x27f               / temporary slot holding the new control word
	fldcw    0(%esp)              / load it into the x87 control register
	popl     %eax                 / release the slot; the popped value is not used here
	ret

        / SpinPause reads no arguments.  It executes one rep;nop pair
        / (opcode bytes F3 90, which is the encoding of the PAUSE
        / spin-wait hint) and then returns 1 in %eax.
        / NOTE(review): the meaning of the return value is defined by the
        / caller, which is not visible in this file - confirm before
        / changing it.
        .align  16
        .globl  SpinPause
SpinPause:
        rep 
        nop
        movl    $1, %eax              / return value
        ret


/ Test SSE availability, used by os_solaris_i486.cpp
/
/ sse_check reads no arguments and returns 1 in %eax when the probe
/ instruction below executes normally.  sse_unavailable is a second,
/ exported entry point that returns 0 in %eax; according to the comment
/ in the body, the signal handler (not visible in this file) resumes
/ there when the probe faults.  The two labels therefore form one unit
/ and must stay in this order with nothing between the probe and the
/ success return that could fault for another reason.
/ Clobbers: %eax, %xmm0 (zeroed by the probe), flags on the failure path.
	.align   16
sse_check:
	/ Fault if SSE not available
	xorps %xmm0,%xmm0
	/ No fault
	movl $1,%eax
	ret
	/ Signal handler continues here if SSE is not available
sse_unavailable:
	xorl %eax,%eax
	ret

/ Fast thread accessors, used by threadLS_solaris_i486.cpp
/
/ gs_load   reads one stack argument, an offset, and returns in %eax the
/           32-bit value stored at that offset within the segment
/           addressed by %gs.  Clobbers %ecx.
/ gs_thread reads no arguments and returns in %eax the 32-bit value at
/           offset 0 of the %gs segment.
/ NOTE(review): what the %gs segment holds on this platform is set up
/ elsewhere - see the C++ user of these routines.
	.align   16
gs_load:
	movl 4(%esp),%ecx             / first argument: offset into the %gs segment
	movl %gs:(%ecx),%eax          / return the dword found there
	ret

	.align   16
gs_thread:
	movl %gs:0x0,%eax             / return the dword at %gs:0
	ret

        / Support for void Copy::conjoint_bytes(void* from,
        /                                       void* to,
        /                                       size_t count)
        /
        / Copies count bytes between two regions that may overlap.
        / The three arguments are read from the stack.  %esi and %edi are
        / saved and restored; %eax, %ecx, %edx and the flags are clobbered.
        /
        / Direction selection (unsigned compares):
        /   to <= from                 -> cb_CopyRight, ascending addresses
        /   from < to <= from+count-1  -> cb_CopyLeft, descending addresses,
        /                                 so that overlapping source bytes
        /                                 are read before being overwritten
        /   otherwise                  -> cb_CopyRight (falls through)
        /
        / Ascending copy: a byte prefix brings the source to a dword
        / boundary, then whole dwords are moved, then a 0-3 byte suffix.
        / Descending copy: whole dwords first, then the 0-3 byte suffix.
        / Runs of more than 32 dwords use rep smovl (the dword string
        / move); shorter runs use an explicit loop.  In those loops %edi
        / is temporarily turned into the delta (to - from) so that the
        / single moving pointer %esi addresses both buffers, and %edi is
        / rebuilt with addl %esi,%edi afterwards.
        .align   16
_Copy_conjoint_bytes:
        pushl    %esi
        movl     4+12(%esp),%ecx      / count
        pushl    %edi
        movl     8+ 4(%esp),%esi      / from
        movl     8+ 8(%esp),%edi      / to
        cmpl     %esi,%edi            / flags for the first jbe; leal below leaves them intact
        leal     -1(%esi,%ecx),%eax   / from + count - 1
        jbe      cb_CopyRight
        cmpl     %eax,%edi
        jbe      cb_CopyLeft
        / copy from low to high
cb_CopyRight:
        cmpl     $3,%ecx
        jbe      5f                   / <= 3 bytes
        / align source address at dword address boundary
        movl     %ecx,%eax            / original count
        movl     $4,%ecx
        subl     %esi,%ecx
        andl     $3,%ecx              / prefix byte count
        jz       1f                   / no prefix
        subl     %ecx,%eax            / byte count less prefix
        / copy prefix
        subl     %esi,%edi            / %edi = to - from
0:      movb     (%esi),%dl
        movb     %dl,(%edi,%esi,1)
        addl     $1,%esi
        subl     $1,%ecx
        jnz      0b
        addl     %esi,%edi            / %edi = current destination again
1:      movl     %eax,%ecx            / byte count less prefix
        shrl     $2,%ecx              / dword count
        jz       4f                   / no dwords to move
        cmpl     $32,%ecx
        jbe      2f                   / <= 32 dwords
        / copy aligned dwords
        rep;     smovl
        jmp      4f
        / copy aligned dwords
2:      subl     %esi,%edi            / %edi = to - from
        .align   16
3:      movl     (%esi),%edx
        movl     %edx,(%edi,%esi,1)
        addl     $4,%esi
        subl     $1,%ecx
        jnz      3b
        addl     %esi,%edi            / %edi = current destination again
4:      movl     %eax,%ecx            / byte count less prefix
5:      andl     $3,%ecx              / suffix byte count
        jz       7f                   / no suffix
        / copy suffix
        xorl     %eax,%eax            / %eax = byte index within the suffix
6:      movb     (%esi,%eax,1),%dl
        movb     %dl,(%edi,%eax,1)
        addl     $1,%eax
        subl     $1,%ecx
        jnz      6b
7:      popl     %edi
        popl     %esi
        ret
        / copy from high to low
cb_CopyLeft:
        std                           / direction flag set: rep smovl walks downwards
        leal     -4(%edi,%ecx),%edi   / to + count - 4
        movl     %eax,%esi            / from + count - 1
        movl     %ecx,%eax            / keep the byte count for the suffix
        subl     $3,%esi              / from + count - 4
        cmpl     $3,%ecx
        jbe      5f                   / <= 3 bytes
1:      shrl     $2,%ecx              / dword count
        jz       4f                   / no dwords to move
        cmpl     $32,%ecx
        ja       3f                   / > 32 dwords
        / copy dwords, aligned or not
        subl     %esi,%edi            / %edi = to - from
        .align   16
2:      movl     (%esi),%edx
        movl     %edx,(%edi,%esi,1)
        subl     $4,%esi
        subl     $1,%ecx
        jnz      2b
        addl     %esi,%edi            / %edi = current destination again
        jmp      4f
        / copy dwords, aligned or not
3:      rep;     smovl
4:      movl     %eax,%ecx            / byte count
5:      andl     $3,%ecx              / suffix byte count
        jz       7f                   / no suffix
        / copy suffix
        subl     %esi,%edi            / %edi = to - from
        addl     $3,%esi              / highest byte not yet copied
6:      movb     (%esi),%dl
        movb     %dl,(%edi,%esi,1)
	subl     $1,%esi
        subl     $1,%ecx
        jnz      6b
7:      cld                           / clear the direction flag again before returning
        popl     %edi
        popl     %esi
        ret

        / Support for void Copy::arrayof_conjoint_bytes(void* from,
        /                                               void* to,
        /                                               size_t count)
        /
        / Same as _Copy_conjoint_bytes, except no source alignment check.
        /
        / The three arguments are read from the stack.  %esi and %edi are
        / saved and restored; %eax, %ecx, %edx and the flags are clobbered.
        /
        / Direction selection (unsigned compares):
        /   to <= from                 -> acb_CopyRight, ascending addresses
        /   from < to <= from+count-1  -> acb_CopyLeft, descending addresses
        /   otherwise                  -> acb_CopyRight (falls through)
        /
        / Both directions move count/4 dwords and then the count%4
        / remaining bytes.  Runs of more than 32 dwords use rep smovl;
        / shorter runs use a loop in which %edi temporarily holds the
        / delta (to - from) and %esi is the only moving pointer.
        .align   16
_Copy_arrayof_conjoint_bytes:
        pushl    %esi
        movl     4+12(%esp),%ecx      / count
        pushl    %edi
        movl     8+ 4(%esp),%esi      / from
        movl     8+ 8(%esp),%edi      / to
        cmpl     %esi,%edi            / flags for the first jbe; leal below leaves them intact
        leal     -1(%esi,%ecx),%eax   / from + count - 1
        jbe      acb_CopyRight
        cmpl     %eax,%edi
        jbe      acb_CopyLeft 
        / copy from low to high
acb_CopyRight:
        cmpl     $3,%ecx
        jbe      5f                   / <= 3 bytes: suffix loop only
1:      movl     %ecx,%eax            / keep the byte count for the suffix
        shrl     $2,%ecx              / dword count
        jz       4f
        cmpl     $32,%ecx
        ja       3f                   / > 32 dwords
        / copy aligned dwords
        subl     %esi,%edi            / %edi = to - from
        .align   16
2:      movl     (%esi),%edx
        movl     %edx,(%edi,%esi,1)
        addl     $4,%esi
        subl     $1,%ecx
        jnz      2b
        addl     %esi,%edi            / %edi = current destination again
        jmp      4f
        / copy aligned dwords
3:      rep;     smovl
4:      movl     %eax,%ecx            / byte count
5:      andl     $3,%ecx              / suffix byte count
        jz       7f                   / no suffix
        / copy suffix
        xorl     %eax,%eax            / %eax = byte index within the suffix
6:      movb     (%esi,%eax,1),%dl
        movb     %dl,(%edi,%eax,1)
        addl     $1,%eax
        subl     $1,%ecx
        jnz      6b
7:      popl     %edi
        popl     %esi
        ret
        / copy from high to low
acb_CopyLeft:
        std                           / direction flag set: rep smovl walks downwards
        leal     -4(%edi,%ecx),%edi   / to + count - 4
        movl     %eax,%esi            / from + count - 1
        movl     %ecx,%eax            / keep the byte count for the suffix
        subl     $3,%esi              / from + count - 4
        cmpl     $3,%ecx
        jbe      5f                   / <= 3 bytes: suffix loop only
1:      shrl     $2,%ecx              / dword count
        jz       4f
        cmpl     $32,%ecx
        jbe      2f                   / <= 32 dwords
        rep;     smovl
        jmp      4f
	/ The next directive advances the location counter by 8 bytes.
	/ The gap follows an unconditional jmp, so it is never executed.
	/ NOTE(review): presumably this tunes where the loop at 3: lands
	/ after the .align below - confirm before changing either.
	.=.+8
2:      subl     %esi,%edi            / %edi = to - from
        .align   16
3:      movl     (%esi),%edx
        movl     %edx,(%edi,%esi,1)
        subl     $4,%esi
        subl     $1,%ecx
        jnz      3b
        addl     %esi,%edi            / %edi = current destination again
4:      movl     %eax,%ecx            / byte count
5:      andl     $3,%ecx              / suffix byte count
        jz       7f                   / no suffix
        subl     %esi,%edi            / %edi = to - from
        addl     $3,%esi              / highest byte not yet copied
6:      movb     (%esi),%dl
        movb     %dl,(%edi,%esi,1)
	subl     $1,%esi
        subl     $1,%ecx
        jnz      6b
7:      cld                           / clear the direction flag again before returning
        popl     %edi
        popl     %esi
        ret

        / Support for void Copy::conjoint_jshorts_atomic(void* from,
        /                                                void* to,
        /                                                size_t count)
        /
        / Copies count 16-bit elements between two regions that may
        / overlap.  Data is only ever moved with 16-bit or 32-bit
        / accesses, never byte by byte.
        / The three arguments are read from the stack.  %esi and %edi are
        / saved and restored; %eax, %ecx, %edx and the flags are clobbered.
        /
        / Direction selection (unsigned compares):
        /   to <= from                   -> cs_CopyRight, ascending addresses
        /   from < to <= from+count*2-2  -> cs_CopyLeft, descending addresses
        /   otherwise                    -> cs_CopyRight (falls through)
        /
        / Ascending copy: if the source is not on a dword boundary one
        / element is copied first, then count/2 dwords, then one trailing
        / element when the remaining count is odd.  Descending copy:
        / count/2 dwords, then the one leftover element at the low end.
        / Runs of more than 32 dwords use rep smovl; shorter runs use a
        / loop in which %edi temporarily holds the delta (to - from).
        .align   16
_Copy_conjoint_jshorts_atomic:
        pushl    %esi
        movl     4+12(%esp),%ecx      / count
        pushl    %edi
        movl     8+ 4(%esp),%esi      / from
        movl     8+ 8(%esp),%edi      / to
        cmpl     %esi,%edi            / flags for the first jbe; leal below leaves them intact
        leal     -2(%esi,%ecx,2),%eax / from + count*2 - 2
        jbe      cs_CopyRight
        cmpl     %eax,%edi
        jbe      cs_CopyLeft 
        / copy from low to high
cs_CopyRight:
        / align source address at dword address boundary
        movl     %esi,%eax            / original from
        andl     $3,%eax              / either 0 or 2
        jz       1f                   / no prefix
        / copy prefix
        subl     $1,%ecx
        jl       5f                   / zero count
        movw     (%esi),%dx
        movw     %dx,(%edi)
        addl     %eax,%esi            / %eax == 2
        addl     %eax,%edi
1:      movl     %ecx,%eax            / word count less prefix
        sarl     %ecx                 / dword count
        jz       4f                   / no dwords to move
        cmpl     $32,%ecx
        jbe      2f                   / <= 32 dwords
        / copy aligned dwords
        rep;     smovl
        jmp      4f 
        / copy aligned dwords
2:      subl     %esi,%edi            / %edi = to - from
        .align   16
3:      movl     (%esi),%edx
        movl     %edx,(%edi,%esi,1)
        addl     $4,%esi
        subl     $1,%ecx
        jnz      3b
        addl     %esi,%edi            / %edi = current destination again
4:      andl     $1,%eax              / suffix count
        jz       5f                   / no suffix
        / copy suffix
        movw     (%esi),%dx
        movw     %dx,(%edi)
5:      popl     %edi
        popl     %esi
        ret
        / copy from high to low
cs_CopyLeft:
        std                           / direction flag set: rep smovl walks downwards
        leal     -4(%edi,%ecx,2),%edi / to + count*2 - 4
        movl     %eax,%esi            / from + count*2 - 2
        movl     %ecx,%eax            / keep the element count for the suffix test
        subl     $2,%esi              / from + count*2 - 4
1:      sarl     %ecx                 / dword count
        jz       4f                   / no dwords to move
        cmpl     $32,%ecx
        ja       3f                   / > 32 dwords
        subl     %esi,%edi            / %edi = to - from
        .align   16
2:      movl     (%esi),%edx
        movl     %edx,(%edi,%esi,1)
        subl     $4,%esi
        subl     $1,%ecx
        jnz      2b
        addl     %esi,%edi            / %edi = current destination again
        jmp      4f
3:      rep;     smovl
4:      andl     $1,%eax              / suffix count
        jz       5f                   / no suffix
        / copy suffix
        addl     $2,%esi              / pointers sit 2 bytes below the leftover element
        addl     $2,%edi
        movw     (%esi),%dx
        movw     %dx,(%edi)
5:      cld                           / clear the direction flag again before returning
        popl     %edi
        popl     %esi
        ret

        / Support for void Copy::arrayof_conjoint_jshorts(void* from,
        /                                                 void* to,
        /                                                 size_t count)
        /
        / Copies count 16-bit elements between two regions that may
        / overlap.  Unlike _Copy_conjoint_jshorts_atomic there is no
        / prefix step that aligns the source before the dword copy.
        / The three arguments are read from the stack.  %esi and %edi are
        / saved and restored; %eax, %ecx, %edx and the flags are clobbered.
        /
        / Direction selection (unsigned compares):
        /   to <= from                   -> acs_CopyRight, ascending addresses
        /   from < to <= from+count*2-2  -> acs_CopyLeft, descending addresses
        /   otherwise                    -> acs_CopyRight (falls through)
        /
        / Both directions move count/2 dwords and then, when count is odd,
        / the single leftover element.  Runs of more than 32 dwords use
        / rep smovl; shorter runs use a loop in which %edi temporarily
        / holds the delta (to - from).
        .align   16
_Copy_arrayof_conjoint_jshorts:
        pushl    %esi
        movl     4+12(%esp),%ecx      / count
        pushl    %edi
        movl     8+ 4(%esp),%esi      / from
        movl     8+ 8(%esp),%edi      / to
        cmpl     %esi,%edi            / flags for the first jbe; leal below leaves them intact
        leal     -2(%esi,%ecx,2),%eax / from + count*2 - 2
        jbe      acs_CopyRight
        cmpl     %eax,%edi
        jbe      acs_CopyLeft 
        / copy from low to high
acs_CopyRight:
        movl     %ecx,%eax            / word count
        sarl     %ecx                 / dword count
        jz       4f                   / no dwords to move
        cmpl     $32,%ecx
        jbe      2f                   / <= 32 dwords
        / copy aligned dwords
        rep;     smovl 	 
        jmp      4f 
        / copy aligned dwords
        / The next directive advances the location counter by 5 bytes.
        / The gap follows an unconditional jmp, so it is never executed.
        / NOTE(review): presumably this tunes where the loop at 3: lands
        / after the .align below - confirm before changing either.
        .=.+5
2:      subl     %esi,%edi 
        .align   16	
3:      movl     (%esi),%edx
        movl     %edx,(%edi,%esi,1)
        addl     $4,%esi
        subl     $1,%ecx
        jnz      3b
        addl     %esi,%edi            / %edi = current destination again
4:      andl     $1,%eax              / suffix count
        jz       5f                   / no suffix
        / copy suffix
        movw     (%esi),%dx
        movw     %dx,(%edi)
5:      popl     %edi
        popl     %esi
        ret
        / copy from high to low
acs_CopyLeft:
        std                           / direction flag set: rep smovl walks downwards
        leal     -4(%edi,%ecx,2),%edi / to + count*2 - 4
        movl     %eax,%esi            / from + count*2 - 2
        movl     %ecx,%eax            / keep the element count for the suffix test
        subl     $2,%esi              / from + count*2 - 4
        sarl     %ecx                 / dword count
        jz       4f                   / no dwords to move
        cmpl     $32,%ecx
        ja       3f                   / > 32 dwords
        subl     %esi,%edi            / %edi = to - from
        .align   16
2:      movl     (%esi),%edx
        movl     %edx,(%edi,%esi,1)
        subl     $4,%esi
        subl     $1,%ecx
        jnz      2b
        addl     %esi,%edi            / %edi = current destination again
        jmp      4f
3:      rep;     smovl
4:      andl     $1,%eax              / suffix count
        jz       5f                   / no suffix
        / copy suffix
        addl     $2,%esi              / pointers sit 2 bytes below the leftover element
        addl     $2,%edi
        movw     (%esi),%dx
        movw     %dx,(%edi)
5:      cld                           / clear the direction flag again before returning
        popl     %edi
        popl     %esi
        ret

        / Support for void Copy::conjoint_jints_atomic(void* from,
        /                                              void* to,
        /                                              size_t count)
        / Equivalent to
        /   arrayof_conjoint_jints
        /
        / Both exported names label the same code.  Copies count 32-bit
        / elements between two regions that may overlap, always one whole
        / dword per access.
        / The three arguments are read from the stack.  %esi and %edi are
        / saved and restored; %eax, %ecx, %edx and the flags are clobbered.
        /
        / Direction selection (unsigned compares):
        /   to <= from                   -> ci_CopyRight, ascending addresses
        /   from < to <= from+count*4-4  -> ci_CopyLeft, descending addresses
        /   otherwise                    -> ci_CopyRight (falls through)
        /
        / Runs of more than 32 dwords use rep smovl.  Shorter runs use a
        / loop that is entered at its decrement-and-test step, so a count
        / of zero copies nothing; the test is signed (jge).
        .align   16
_Copy_conjoint_jints_atomic:
_Copy_arrayof_conjoint_jints:
        pushl    %esi
        movl     4+12(%esp),%ecx      / count
        pushl    %edi
        movl     8+ 4(%esp),%esi      / from
        movl     8+ 8(%esp),%edi      / to
        cmpl     %esi,%edi            / flags for the first jbe; leal below leaves them intact
        leal     -4(%esi,%ecx,4),%eax / from + count*4 - 4
        jbe      ci_CopyRight
        cmpl     %eax,%edi
        jbe      ci_CopyLeft 
        / copy from low to high
ci_CopyRight:
        cmpl     $32,%ecx
        jbe      2f                   / <= 32 dwords
        rep;     smovl 
        popl     %edi
        popl     %esi
        ret
        / The next directive advances the location counter by 10 bytes.
        / The gap follows a ret, so it is never executed.
        / NOTE(review): presumably this tunes where the loop at 3: lands
        / after the .align below - confirm before changing either.
        .=.+10
2:      subl     %esi,%edi            / %edi = to - from
        jmp      4f                   / enter the loop at its count test
        .align   16
3:      movl     (%esi),%edx
        movl     %edx,(%edi,%esi,1)
        addl     $4,%esi
4:      subl     $1,%ecx
        jge      3b                   / continue while elements remain
        popl     %edi
        popl     %esi
        ret
        / copy from high to low
ci_CopyLeft:
        std                           / direction flag set: rep smovl walks downwards
        leal     -4(%edi,%ecx,4),%edi / to + count*4 - 4
        cmpl     $32,%ecx
        ja       4f                   / > 32 dwords
        subl     %eax,%edi            / eax == from + count*4 - 4
        jmp      3f                   / enter the loop at its count test
        .align   16
2:      movl     (%eax),%edx          / %eax is the moving source pointer here
        movl     %edx,(%edi,%eax,1)   / %edi holds the delta (to - from)
        subl     $4,%eax
3:      subl     $1,%ecx
        jge      2b                   / continue while elements remain
        cld                           / clear the direction flag again before returning
        popl     %edi
        popl     %esi
        ret
4:      movl     %eax,%esi            / from + count*4 - 4
        rep;     smovl
        cld                           / clear the direction flag again before returning
        popl     %edi
        popl     %esi
        ret
	
        / Support for void Copy::conjoint_jlongs_atomic(jlong* from,
        /                                               jlong* to,
        /                                               size_t count)
        /
        / 32-bit
        /
        / count treated as signed
        /
        / if (from > to) {
        /   while (--count >= 0) {
        /     *to++ = *from++;
        /   }
        / } else {
        /   while (--count >= 0) {
        /     to[count] = from[count];
        /   }
        / }
        /
        / Each element is moved by one 64-bit x87 integer load (fildll)
        / followed by one 64-bit store-and-pop (fistpll), so every jlong
        / is read and written with a single 8-byte memory access and the
        / x87 register stack depth is the same on exit as on entry.
        / The three arguments are read from the stack.  No registers are
        / saved; %eax, %ecx, %edx and the flags are clobbered.
        / Both loops are entered at their decrement-and-test step, so a
        / count of zero copies nothing.
        .align   16
_Copy_conjoint_jlongs_atomic:
        movl     4+8(%esp),%ecx       / count
        movl     4+0(%esp),%eax       / from
        movl     4+4(%esp),%edx       / to
        cmpl     %eax,%edx
        jae      cla_CopyLeft         / to >= from (unsigned): copy from the high end
cla_CopyRight:
        subl     %eax,%edx            / %edx = to - from
        jmp      2f                   / enter the loop at its count test
        .align   16
1:      fildll   (%eax)               / load *from
        fistpll  (%edx,%eax,1)        / store to the matching destination slot
        addl     $8,%eax
2:      subl     $1,%ecx
        jge      1b
        ret
        .align   16
3:      fildll   (%eax,%ecx,8)        / load from[count]
        fistpll  (%edx,%ecx,8)        / store to[count]
cla_CopyLeft:
        subl     $1,%ecx
        jge      3b
        ret

        / MMX variant of the support routine for
        /   void Copy::arrayof_conjoint_jshorts(void* from,
        /                                       void* to,
        /                                       size_t count)
        /
        / Copies count 16-bit elements between two regions that may
        / overlap.  The three arguments are read from the stack.  %esi and
        / %edi are saved and restored; %eax, %ecx, %edx, the flags and,
        / on the long ascending path, %mm0-%mm2 are clobbered.  emms is
        / executed after the MMX loop, before any return.
        /
        / Direction selection (unsigned compares):
        /   to <= from                   -> mmx_acs_CopyRight, ascending
        /   from < to <= from+count*2-2  -> mmx_acs_CopyLeft, descending
        /   otherwise                    -> mmx_acs_CopyRight (falls through)
        /
        / Ascending copy with fewer than 33 dwords: plain dword loop.
        / Ascending copy with 33 or more dwords: one dword is moved with
        / smovl, then 64-byte chunks are moved through %mm0-%mm2 while at
        / least 16 dwords remain, and any remaining dwords go through the
        / plain dword loop.  A single trailing element is copied when
        / count is odd.
        / Descending copy: no MMX is used; dword loop or rep smovl, then
        / the single leftover element at the low end.
       .align   16
_mmx_Copy_arrayof_conjoint_jshorts:
        pushl    %esi
        movl     4+12(%esp),%ecx      / count
        pushl    %edi
        movl     8+ 4(%esp),%esi      / from
        movl     8+ 8(%esp),%edi      / to
        cmpl     %esi,%edi            / flags for the first jbe; leal below leaves them intact
        leal     -2(%esi,%ecx,2),%eax / from + count*2 - 2
        jbe      mmx_acs_CopyRight
        cmpl     %eax,%edi
        jbe      mmx_acs_CopyLeft
        / copy from low to high
mmx_acs_CopyRight:
        movl     %ecx,%eax            / keep the element count for the suffix test
        sarl     %ecx                 / dword count
        je       5f                   / no dwords to move
        cmpl     $33,%ecx
        jae      3f                   / >= 33 dwords: MMX path
1:      subl     %esi,%edi 
        .align   16
2:      movl     (%esi),%edx
        movl     %edx,(%edi,%esi,1)
        addl     $4,%esi
        subl     $1,%ecx
        jnz      2b
        addl     %esi,%edi            / %edi = current destination again
        jmp      5f 
3:      smovl / align to 8 bytes, we know we are 4 byte aligned to start
        subl     $1,%ecx              / account for the dword just moved
        / Each pass below moves 64 bytes (16 dwords).  Loads and stores
        / are interleaved across %mm0-%mm2, and %edi is advanced first,
        / which is why the stores use negative displacements.
4:      .align   16
        movq     0(%esi),%mm0
        addl     $64,%edi
        movq     8(%esi),%mm1
        subl     $16,%ecx
        movq     16(%esi),%mm2
        movq     %mm0,-64(%edi)
        movq     24(%esi),%mm0
        movq     %mm1,-56(%edi)
        movq     32(%esi),%mm1
        movq     %mm2,-48(%edi)
        movq     40(%esi),%mm2
        movq     %mm0,-40(%edi)
        movq     48(%esi),%mm0
        movq     %mm1,-32(%edi)
        movq     56(%esi),%mm1
        movq     %mm2,-24(%edi)
        movq     %mm0,-16(%edi)
        addl     $64,%esi
        movq     %mm1,-8(%edi)
        cmpl     $16,%ecx
        jge      4b                   / another full 64-byte chunk remains
        emms                          / leave MMX state so x87 code can run afterwards
	testl    %ecx,%ecx
	ja       1b                   / leftover dwords: testl clears CF, so this jumps when %ecx != 0
5:      andl     $1,%eax              / suffix count
        je       7f                   / no suffix
6:      movw     (%esi),%dx
        movw     %dx,(%edi)
7:      popl     %edi
        popl     %esi
        ret
        / copy from high to low
mmx_acs_CopyLeft:
        std                           / direction flag set: rep smovl walks downwards
        leal     -4(%edi,%ecx,2),%edi / to + count*2 - 4
        movl     %eax,%esi            / from + count*2 - 2
        movl     %ecx,%eax            / keep the element count for the suffix test
        subl     $2,%esi              / from + count*2 - 4
        sarl     %ecx                 / dword count
        je       4f                   / no dwords to move
        cmpl     $32,%ecx
        ja       3f                   / > 32 dwords
        subl     %esi,%edi            / %edi = to - from
        .align   16
2:      movl     (%esi),%edx
        movl     %edx,(%edi,%esi,1)
        subl     $4,%esi
        subl     $1,%ecx
        jnz      2b
        addl     %esi,%edi            / %edi = current destination again
        jmp      4f
3:      rep;     smovl
4:      andl     $1,%eax              / suffix count
        je       6f                   / no suffix
        addl     $2,%esi              / pointers sit 2 bytes below the leftover element
        addl     $2,%edi
5:      movw     (%esi),%dx
        movw     %dx,(%edi)
6:      cld                           / clear the direction flag again before returning
        popl     %edi
        popl     %esi
        ret


        / Support for jlong Atomic::cmpxchg(jlong exchange_value,
        /                                   volatile jlong* dest,
        /                                   jlong compare_value)
        / Used only for Solaris/gcc builds
        /
        / 64-bit compare-and-exchange built on lock cmpxchg8b.  The
        / instruction compares %edx:%eax with the 8 bytes at (%edi); when
        / they match it stores %ecx:%ebx there, otherwise it loads the
        / current memory value into %edx:%eax.  Either way %edx:%eax holds
        / the value that was in *dest before the operation, and that pair
        / is left in place as the return value.
        / %ebx and %edi are saved and restored; %ecx and the flags are
        / clobbered.  The stack offsets below are those in effect after
        / the two pushes.
        .align 16
_Atomic_cmpxchg_long_gcc:
                                   /  8(%esp) : return PC
        pushl    %ebx              /  4(%esp) : old %ebx
        pushl    %edi              /  0(%esp) : old %edi
        movl     12(%esp), %ebx    / 12(%esp) : exchange_value (low)
        movl     16(%esp), %ecx    / 16(%esp) : exchange_value (high)
        movl     24(%esp), %eax    / 24(%esp) : compare_value (low)
        movl     28(%esp), %edx    / 28(%esp) : compare_value (high)
        movl     20(%esp), %edi    / 20(%esp) : dest
        lock cmpxchg8b (%edi)
        popl     %edi
        popl     %ebx
        ret