# 
# Copyright (c) 2004, 2013, Oracle and/or its affiliates. All rights reserved.
# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
#
# This code is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License version 2 only, as
# published by the Free Software Foundation.
#
# This code is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# version 2 for more details (a copy is included in the LICENSE file that
# accompanied this code).
#
# You should have received a copy of the GNU General Public License version
# 2 along with this work; if not, write to the Free Software Foundation,
# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
#
# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
# or visit www.oracle.com if you need additional information or have any
# questions.
#

# Platform-dependent helpers used by the symbol definitions below.
# SYMBOL yields the assembler-level name of a global symbol; ELF_TYPE emits
# a .type directive on non-Apple builds and expands to nothing on Apple ones.
#ifdef __APPLE__
# Darwin uses _ prefixed global symbols
#define SYMBOL(s) _ ## s
#define ELF_TYPE(name, description)
#else
#define SYMBOL(s) s
#define ELF_TYPE(name, description) .type name,description
#endif

        # NOTE WELL!  The _Copy functions are called directly
	# from server-compiler-generated code via CallLeafNoFP,
	# which means that they *must* either not use floating
	# point or use it in the same manner as does the server
	# compiler.
	
        # Exported entry points of the copy routines defined further down.
        # Each jshorts, jints and jlongs routine is reachable under two
        # names (arrayof_conjoint_* and conjoint_*_atomic) that label the
        # same code.
        .globl SYMBOL(_Copy_arrayof_conjoint_bytes)
	.globl SYMBOL(_Copy_arrayof_conjoint_jshorts)
        .globl SYMBOL(_Copy_conjoint_jshorts_atomic)
        .globl SYMBOL(_Copy_arrayof_conjoint_jints)
        .globl SYMBOL(_Copy_conjoint_jints_atomic)
        .globl SYMBOL(_Copy_arrayof_conjoint_jlongs)
        .globl SYMBOL(_Copy_conjoint_jlongs_atomic)

        # Everything that follows is assembled into the text section.
	.text

        # SpinPause
        #
        # Executes one spin-wait hint instruction and returns with rax = 1.
        # No argument registers are read; only rax is written.
        #
        .globl SYMBOL(SpinPause)
        .p2align 4,,15
        ELF_TYPE(SpinPause,@function)
SYMBOL(SpinPause):
        pause                         # spin-wait hint (the rep-prefixed nop)
        movl     $1,%eax              # 32-bit write zero-extends: rax = 1
        ret

        # Support for void Copy::arrayof_conjoint_bytes(void* from,
        #                                               void* to,
        #                                               size_t count)
        # rdi - from
        # rsi - to
        # rdx - count, treated as ssize_t
        #
        # Copies count bytes from 'from' to 'to'; the ranges may overlap.
        # The bulk is moved as count>>3 qwords, four per iteration while at
        # least four remain.  Bits 2, 1 and 0 of the byte count then select
        # a trailing dword, word and byte.
        #
        # Direction, decided with unsigned pointer compares:
        #   to <= from                 -> acb_CopyRight (ascending addresses)
        #   from < to <= from+count-1  -> acb_CopyLeft  (descending addresses)
        #   to > from+count-1          -> acb_CopyRight
        #
        # Registers written: rax, rcx, rdx, rsi, r8 and the flags.
        # rdi is never modified.
        #
        .p2align 4,,15
	ELF_TYPE(_Copy_arrayof_conjoint_bytes,@function)
SYMBOL(_Copy_arrayof_conjoint_bytes):
        movq     %rdx,%r8             # byte count
        shrq     $3,%rdx              # qword count
        cmpq     %rdi,%rsi
        leaq     -1(%rdi,%r8,1),%rax  # from + bcount*1 - 1
                                      #  (leaq leaves the cmpq flags intact)
        jbe      acb_CopyRight
        cmpq     %rax,%rsi
        jbe      acb_CopyLeft 
acb_CopyRight:
        # Ascending copy.  rax and rcx address the last whole qword of the
        # source and destination; rdx is negated so that it counts up to
        # zero while indexing forward through the qwords.
        leaq     -8(%rdi,%rdx,8),%rax # from + qcount*8 - 8
        leaq     -8(%rsi,%rdx,8),%rcx # to + qcount*8 - 8
        negq     %rdx
        jmp      7f
        .p2align 4,,15
        # Loop 1: one qword per iteration, used for the last one to three
        # qwords.  rsi serves as the scratch register from here on.
1:      movq     8(%rax,%rdx,8),%rsi
        movq     %rsi,8(%rcx,%rdx,8)
        addq     $1,%rdx
        jnz      1b
        # Tail: 8(rax) and 8(rcx) address the first byte after the qwords.
2:      testq    $4,%r8               # check for trailing dword
        jz       3f
        movl     8(%rax),%esi         # copy trailing dword
        movl     %esi,8(%rcx)
        addq     $4,%rax
        addq     $4,%rcx              # original %rsi is trashed, so we
                                      #  can't use it as a base register
3:      testq    $2,%r8               # check for trailing word
        jz       4f
        movw     8(%rax),%si          # copy trailing word
        movw     %si,8(%rcx)
        addq     $2,%rcx              # only rcx advances: the byte load
                                      #  below is addressed via rdi and r8
4:      testq    $1,%r8               # check for trailing byte
        jz       5f
        movb     -1(%rdi,%r8,1),%al   # copy trailing byte
        movb     %al,8(%rcx)
5:      ret
        .p2align 4,,15
        # Loop 6: unrolled, four qwords per iteration in ascending order.
6:      movq     -24(%rax,%rdx,8),%rsi
        movq     %rsi,-24(%rcx,%rdx,8)
        movq     -16(%rax,%rdx,8),%rsi
        movq     %rsi,-16(%rcx,%rdx,8)
        movq     -8(%rax,%rdx,8),%rsi
        movq     %rsi,-8(%rcx,%rdx,8)
        movq     (%rax,%rdx,8),%rsi
        movq     %rsi,(%rcx,%rdx,8)
        # Dispatch: after adding 4, a result <= 0 means at least four qwords
        # remain.  Otherwise the 4 is taken back; a negative result means
        # one to three qwords remain, zero means the qwords are done.
7:      addq     $4,%rdx
        jle      6b
        subq     $4,%rdx
        jl       1b
        jmp      2b
acb_CopyLeft:
        # Descending copy.  The tail pieces sit at the highest addresses, so
        # they are moved first; rcx is the scratch register and the original
        # rdi and rsi stay usable as base registers.
        testq    $1,%r8               # check for trailing byte
        jz       1f
        movb     -1(%rdi,%r8,1),%cl   # copy trailing byte
        movb     %cl,-1(%rsi,%r8,1)
        subq     $1,%r8               # adjust for possible trailing word
1:      testq    $2,%r8               # check for trailing word
        jz       2f
        movw     -2(%rdi,%r8,1),%cx   # copy trailing word
        movw     %cx,-2(%rsi,%r8,1)
2:      testq    $4,%r8               # check for trailing dword
        jz       5f
        movl     (%rdi,%rdx,8),%ecx   # copy trailing dword
        movl     %ecx,(%rsi,%rdx,8)
        jmp      5f
        .p2align 4,,15
        # Loop 3: one qword per iteration, highest remaining qword first,
        # until rdx reaches zero.
3:      movq     -8(%rdi,%rdx,8),%rcx
        movq     %rcx,-8(%rsi,%rdx,8)
        subq     $1,%rdx
        jnz      3b
        ret
        .p2align 4,,15
        # Loop 4: unrolled, four qwords per iteration in descending order.
4:      movq     24(%rdi,%rdx,8),%rcx
        movq     %rcx,24(%rsi,%rdx,8)
        movq     16(%rdi,%rdx,8),%rcx
        movq     %rcx,16(%rsi,%rdx,8)
        movq     8(%rdi,%rdx,8),%rcx
        movq     %rcx,8(%rsi,%rdx,8)
        movq     (%rdi,%rdx,8),%rcx
        movq     %rcx,(%rsi,%rdx,8)
        # Dispatch: a non-negative result after subtracting 4 means at least
        # four qwords remain.  Otherwise the 4 is added back and loop 3
        # handles a positive remainder.
5:      subq     $4,%rdx
        jge      4b
        addq     $4,%rdx
        jg       3b
        ret

        # Support for void Copy::arrayof_conjoint_jshorts(void* from,
        #                                                 void* to,
        #                                                 size_t count)
        # Equivalent to
        #   conjoint_jshorts_atomic
        #
        # If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
        # let the hardware handle it.  The two or four words within dwords
        # or qwords that span cache line boundaries will still be loaded
        # and stored atomically.
        #
        # rdi - from
        # rsi - to
        # rdx - count, treated as ssize_t
        #
        # count is a number of 2-byte elements.  The bulk is moved as
        # count>>2 qwords, four per iteration while at least four remain.
        # Bits 1 and 0 of the count then select a trailing dword and word.
        #
        # Direction, decided with unsigned pointer compares against the
        # address of the last source element (from + count*2 - 2):
        #   to <= from                 -> acs_CopyRight (ascending addresses)
        #   from < to <= last element  -> acs_CopyLeft  (descending addresses)
        #   to > last element          -> acs_CopyRight
        #
        # Registers written: rax, rcx, rdx, rsi, r8 and the flags.
        # rdi is never modified.
        #
        .p2align 4,,15
	ELF_TYPE(_Copy_arrayof_conjoint_jshorts,@function)
	ELF_TYPE(_Copy_conjoint_jshorts_atomic,@function)
SYMBOL(_Copy_arrayof_conjoint_jshorts):
SYMBOL(_Copy_conjoint_jshorts_atomic):
        movq     %rdx,%r8             # word count
        shrq     $2,%rdx              # qword count
        cmpq     %rdi,%rsi
        leaq     -2(%rdi,%r8,2),%rax  # from + wcount*2 - 2
                                      #  (leaq leaves the cmpq flags intact)
        jbe      acs_CopyRight
        cmpq     %rax,%rsi
        jbe      acs_CopyLeft 
acs_CopyRight:
        # Ascending copy.  rax and rcx address the last whole qword of the
        # source and destination; rdx is negated so that it counts up to
        # zero while indexing forward through the qwords.
        leaq     -8(%rdi,%rdx,8),%rax # from + qcount*8 - 8
        leaq     -8(%rsi,%rdx,8),%rcx # to + qcount*8 - 8
        negq     %rdx
        jmp      6f
        # Loop 1: one qword per iteration, used for the last one to three
        # qwords.  rsi serves as the scratch register from here on.
1:      movq     8(%rax,%rdx,8),%rsi
        movq     %rsi,8(%rcx,%rdx,8)
        addq     $1,%rdx
        jnz      1b
        # Tail: 8(rax) and 8(rcx) address the first byte after the qwords.
2:      testq    $2,%r8               # check for trailing dword
        jz       3f
        movl     8(%rax),%esi         # copy trailing dword
        movl     %esi,8(%rcx)
        addq     $4,%rcx              # original %rsi is trashed, so we
                                      #  can't use it as a base register
3:      testq    $1,%r8               # check for trailing word
        jz       4f
        movw     -2(%rdi,%r8,2),%si   # copy trailing word
        movw     %si,8(%rcx)
4:      ret
        .p2align 4,,15
        # Loop 5: unrolled, four qwords per iteration in ascending order.
5:      movq     -24(%rax,%rdx,8),%rsi
        movq     %rsi,-24(%rcx,%rdx,8)
        movq     -16(%rax,%rdx,8),%rsi
        movq     %rsi,-16(%rcx,%rdx,8)
        movq     -8(%rax,%rdx,8),%rsi
        movq     %rsi,-8(%rcx,%rdx,8)
        movq     (%rax,%rdx,8),%rsi
        movq     %rsi,(%rcx,%rdx,8)
        # Dispatch: after adding 4, a result <= 0 means at least four qwords
        # remain.  Otherwise the 4 is taken back; a negative result means
        # one to three qwords remain, zero means the qwords are done.
6:      addq     $4,%rdx
        jle      5b
        subq     $4,%rdx
        jl       1b
        jmp      2b
acs_CopyLeft:
        # Descending copy.  The tail pieces sit at the highest addresses, so
        # they are moved first; rcx is the scratch register and the original
        # rdi and rsi stay usable as base registers.
        testq    $1,%r8               # check for trailing word
        jz       1f
        movw     -2(%rdi,%r8,2),%cx   # copy trailing word
        movw     %cx,-2(%rsi,%r8,2)
1:      testq    $2,%r8               # check for trailing dword
        jz       4f
        movl     (%rdi,%rdx,8),%ecx   # copy trailing dword
        movl     %ecx,(%rsi,%rdx,8)
        jmp      4f
        # Loop 2: one qword per iteration, highest remaining qword first,
        # until rdx reaches zero.
2:      movq     -8(%rdi,%rdx,8),%rcx
        movq     %rcx,-8(%rsi,%rdx,8)
        subq     $1,%rdx
        jnz      2b
        ret
        .p2align 4,,15
        # Loop 3: unrolled, four qwords per iteration in descending order.
3:      movq     24(%rdi,%rdx,8),%rcx
        movq     %rcx,24(%rsi,%rdx,8)
        movq     16(%rdi,%rdx,8),%rcx
        movq     %rcx,16(%rsi,%rdx,8)
        movq     8(%rdi,%rdx,8),%rcx
        movq     %rcx,8(%rsi,%rdx,8)
        movq     (%rdi,%rdx,8),%rcx
        movq     %rcx,(%rsi,%rdx,8)
        # Dispatch: a non-negative result after subtracting 4 means at least
        # four qwords remain.  Otherwise the 4 is added back and loop 2
        # handles a positive remainder.
4:      subq     $4,%rdx
        jge      3b
        addq     $4,%rdx
        jg       2b
        ret

        # Support for void Copy::arrayof_conjoint_jints(jint* from,
        #                                               jint* to,
        #                                               size_t count)
        # Equivalent to
        #   conjoint_jints_atomic
        #
        # If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
        # the hardware handle it.  The two dwords within qwords that span
        # cache line boundaries will still be loaded and stored atomically.
        #
        # rdi - from
        # rsi - to
        # rdx - count, treated as ssize_t
        #
        # count is a number of 4-byte elements.  The bulk is moved as
        # count>>1 qwords, four per iteration while at least four remain.
        # Bit 0 of the count then selects one trailing dword.
        #
        # Direction, decided with unsigned pointer compares against the
        # address of the last source element (from + count*4 - 4):
        #   to <= from                 -> aci_CopyRight (ascending addresses)
        #   from < to <= last element  -> aci_CopyLeft  (descending addresses)
        #   to > last element          -> aci_CopyRight
        #
        # Registers written: rax, rcx, rdx, rsi, r8 and the flags.
        # rdi is never modified.
        #
        .p2align 4,,15
	ELF_TYPE(_Copy_arrayof_conjoint_jints,@function)
	ELF_TYPE(_Copy_conjoint_jints_atomic,@function)
SYMBOL(_Copy_arrayof_conjoint_jints):
SYMBOL(_Copy_conjoint_jints_atomic):
        movq     %rdx,%r8             # dword count
        shrq     %rdx                 # qword count
        cmpq     %rdi,%rsi
        leaq     -4(%rdi,%r8,4),%rax  # from + dcount*4 - 4
                                      #  (leaq leaves the cmpq flags intact)
        jbe      aci_CopyRight
        cmpq     %rax,%rsi
        jbe      aci_CopyLeft 
aci_CopyRight:
        # Ascending copy.  rax and rcx address the last whole qword of the
        # source and destination; rdx is negated so that it counts up to
        # zero while indexing forward through the qwords.
        leaq     -8(%rdi,%rdx,8),%rax # from + qcount*8 - 8
        leaq     -8(%rsi,%rdx,8),%rcx # to + qcount*8 - 8
        negq     %rdx
        jmp      5f
        .p2align 4,,15
        # Loop 1: one qword per iteration, used for the last one to three
        # qwords.  rsi serves as the scratch register from here on.
1:      movq     8(%rax,%rdx,8),%rsi
        movq     %rsi,8(%rcx,%rdx,8)
        addq     $1,%rdx
        jnz       1b
        # Tail: 8(rax) and 8(rcx) address the first byte after the qwords.
2:      testq    $1,%r8               # check for trailing dword
        jz       3f
        movl     8(%rax),%esi         # copy trailing dword
        movl     %esi,8(%rcx)
3:      ret
        .p2align 4,,15
        # Loop 4: unrolled, four qwords per iteration in ascending order.
4:      movq     -24(%rax,%rdx,8),%rsi
        movq     %rsi,-24(%rcx,%rdx,8)
        movq     -16(%rax,%rdx,8),%rsi
        movq     %rsi,-16(%rcx,%rdx,8)
        movq     -8(%rax,%rdx,8),%rsi
        movq     %rsi,-8(%rcx,%rdx,8)
        movq     (%rax,%rdx,8),%rsi
        movq     %rsi,(%rcx,%rdx,8)
        # Dispatch: after adding 4, a result <= 0 means at least four qwords
        # remain.  Otherwise the 4 is taken back; a negative result means
        # one to three qwords remain, zero means the qwords are done.
5:      addq     $4,%rdx
        jle      4b
        subq     $4,%rdx
        jl       1b
        jmp      2b
aci_CopyLeft:
        # Descending copy.  The trailing dword is the last element, so it is
        # moved first; rcx is the scratch register and the original rdi and
        # rsi stay usable as base registers.
        testq    $1,%r8               # check for trailing dword
        jz       3f
        movl     -4(%rdi,%r8,4),%ecx  # copy trailing dword
        movl     %ecx,-4(%rsi,%r8,4)
        jmp      3f
        # Loop 1: one qword per iteration, highest remaining qword first,
        # until rdx reaches zero.
1:      movq     -8(%rdi,%rdx,8),%rcx
        movq     %rcx,-8(%rsi,%rdx,8)
        subq     $1,%rdx
        jnz      1b
        ret
        .p2align 4,,15
        # Loop 2: unrolled, four qwords per iteration in descending order.
2:      movq     24(%rdi,%rdx,8),%rcx
        movq     %rcx,24(%rsi,%rdx,8)
        movq     16(%rdi,%rdx,8),%rcx
        movq     %rcx,16(%rsi,%rdx,8)
        movq     8(%rdi,%rdx,8),%rcx
        movq     %rcx,8(%rsi,%rdx,8)
        movq     (%rdi,%rdx,8),%rcx
        movq     %rcx,(%rsi,%rdx,8)
        # Dispatch: a non-negative result after subtracting 4 means at least
        # four qwords remain.  Otherwise the 4 is added back and loop 1
        # handles a positive remainder.
3:      subq     $4,%rdx
        jge      2b
        addq     $4,%rdx
        jg       1b
        ret

        # Support for void Copy::arrayof_conjoint_jlongs(jlong* from,
        #                                                jlong* to,
        #                                                size_t count)
        # Equivalent to
        #   conjoint_jlongs_atomic
        #   arrayof_conjoint_oops
        #   conjoint_oops_atomic
        #
        # rdi - from
        # rsi - to
        # rdx - count, treated as ssize_t
        #
        # count is a number of 8-byte elements, so the whole copy is done
        # with qword moves (four per iteration while at least four remain)
        # and there is no tail handling.
        #
        # Direction, decided with unsigned pointer compares against the
        # address of the last source element (from + count*8 - 8):
        #   to <= from                 -> acl_CopyRight (ascending addresses)
        #   from < to <= last element  -> acl_CopyLeft  (descending addresses)
        #   to > last element          -> acl_CopyRight
        #
        # Registers written: rax, rcx, rdx, rsi and the flags.
        # rdi is never modified.
        #
        .p2align 4,,15
	ELF_TYPE(_Copy_arrayof_conjoint_jlongs,@function)
	ELF_TYPE(_Copy_conjoint_jlongs_atomic,@function)
SYMBOL(_Copy_arrayof_conjoint_jlongs):
SYMBOL(_Copy_conjoint_jlongs_atomic):
        cmpq     %rdi,%rsi
        leaq     -8(%rdi,%rdx,8),%rax # from + count*8 - 8
                                      #  (leaq leaves the cmpq flags intact)
        jbe      acl_CopyRight
        cmpq     %rax,%rsi
        jbe      acl_CopyLeft 
acl_CopyRight:
        # Ascending copy.  rax, computed above for the overlap test, doubles
        # as the source base; rcx is the matching destination base.  rdx is
        # negated so that it counts up to zero.
        leaq     -8(%rsi,%rdx,8),%rcx # to + count*8 - 8
        negq     %rdx
        jmp      3f
        # Loop 1: one qword per iteration, used for the last one to three
        # qwords.  rsi serves as the scratch register from here on.
1:      movq     8(%rax,%rdx,8),%rsi
        movq     %rsi,8(%rcx,%rdx,8)
        addq     $1,%rdx
        jnz      1b
        ret
        .p2align 4,,15
        # Loop 2: unrolled, four qwords per iteration in ascending order.
2:      movq     -24(%rax,%rdx,8),%rsi
        movq     %rsi,-24(%rcx,%rdx,8)
        movq     -16(%rax,%rdx,8),%rsi
        movq     %rsi,-16(%rcx,%rdx,8)
        movq     -8(%rax,%rdx,8),%rsi
        movq     %rsi,-8(%rcx,%rdx,8)
        movq     (%rax,%rdx,8),%rsi
        movq     %rsi,(%rcx,%rdx,8)
        # Dispatch: after adding 4, a result <= 0 means at least four qwords
        # remain.  Otherwise the 4 is taken back; a negative result means
        # one to three qwords remain, zero means the copy is complete.
3:      addq     $4,%rdx
        jle      2b
        subq     $4,%rdx
        jl       1b
        ret
        # Loops 4 and 5 belong to the descending copy, which is entered at
        # acl_CopyLeft below.
        # Loop 4: one qword per iteration, highest remaining qword first,
        # until rdx reaches zero.
4:      movq     -8(%rdi,%rdx,8),%rcx
        movq     %rcx,-8(%rsi,%rdx,8)
        subq     $1,%rdx
        jnz      4b
        ret
        .p2align 4,,15
        # Loop 5: unrolled, four qwords per iteration in descending order.
5:      movq     24(%rdi,%rdx,8),%rcx
        movq     %rcx,24(%rsi,%rdx,8)
        movq     16(%rdi,%rdx,8),%rcx
        movq     %rcx,16(%rsi,%rdx,8)
        movq     8(%rdi,%rdx,8),%rcx
        movq     %rcx,8(%rsi,%rdx,8)
        movq     (%rdi,%rdx,8),%rcx
        movq     %rcx,(%rsi,%rdx,8)
acl_CopyLeft:
        # Descending copy entry and dispatch.  rdx is the element count on
        # entry; rcx is the scratch register.  A non-negative result after
        # subtracting 4 means at least four qwords remain.  Otherwise the 4
        # is added back and loop 4 handles a positive remainder.
        subq     $4,%rdx
        jge      5b
        addq     $4,%rdx
        jg       4b
        ret
