8068054: AARCH64: Assembler interpreter, shared runtime
Summary: add src/cpu/aarch64/vm/* interpreter, shared runtime files.
Reviewed-by: kvn, roland, coleenp, twisti
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/aarch64Test.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include <stdlib.h>
+
+#include "precompiled.hpp"
+#include "code/codeBlob.hpp"
+#include "asm/macroAssembler.hpp"
+
+// hook routine called during JVM bootstrap to test AArch64 assembler
+
+extern "C" void entry(CodeBuffer*);
+
+void aarch64TestHook()
+{
+ BufferBlob* b = BufferBlob::create("aarch64Test", 500000);
+ CodeBuffer code(b);
+ MacroAssembler _masm(&code);
+ entry(&code);
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/aarch64_call.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifdef BUILTIN_SIM
+
+#include <stdio.h>
+#include <sys/types.h>
+#include "asm/macroAssembler.hpp"
+#include "asm/macroAssembler.inline.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "../../../../../../simulator/cpustate.hpp"
+#include "../../../../../../simulator/simulator.hpp"
+
+/*
+ * a routine to initialise and enter ARM simulator execution when
+ * calling into ARM code from x86 code.
+ *
+ * we maintain a simulator per-thread and provide it with 8 MB of
+ * stack space
+ */
+#define SIM_STACK_SIZE (1024 * 1024) // in units of u_int64_t
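+// n.b. SIM_STACK_SIZE counts u_int64_t slots, so the stack size in bytes
+// is 1024 * 1024 * sizeof(u_int64_t) = 8 MB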
+
+extern "C" u_int64_t get_alt_stack()
+{
+ return AArch64Simulator::altStack();
+}
+
+extern "C" void setup_arm_sim(void *sp, u_int64_t calltype)
+{
+ // n.b. this function runs on the simulator stack so as to avoid
+ // simulator frames appearing in between VM x86 and ARM frames. note
+ // that argument sp points to the old (VM) stack from which the
+ // call into the sim was made. The stack switch and entry into this
+ // routine is handled by x86 prolog code planted in the head of the
+ // ARM code buffer which the sim is about to start executing (see
+ // aarch64_linkage.S).
+ //
+ // The first ARM instruction in the buffer is identified by fnptr
+ // stored at the top of the old stack. x86 register contents precede
+ // fnptr. preceding that are the fp and return address of the VM
+ // caller into ARM code. any extra, non-register arguments passed to
+ // the linkage routine precede the fp (this is as per any normal x86
+ // call with extra args).
+ //
+ // note that the sim creates Java frames on the Java stack just
+ // above sp (i.e. directly above fnptr). it sets the sim FP register
+ // to the pushed fp for the caller effectively eliding the register
+ // data saved by the linkage routine.
+ //
+ // x86 register call arguments are loaded from the stack into ARM
+ // call registers. if extra arguments occur preceding the x86
+ // caller's fp then they are copied either into extra ARM registers
+ // (ARM has 8 rather than 6 gp call registers) or up the stack
+ // beyond the saved x86 registers so that they immediately precede
+ // the ARM frame where the ARM calling convention expects them to
+ // be.
+ //
+ // n.b. the number of register/stack values passed to the ARM code
+ // is determined by calltype
+ //
+ // +--------+
+ // | fnptr | <--- argument sp points here
+ // +--------+ |
+ // | rax | | return slot if we need to return a value
+ // +--------+ |
+ // | rdi | increasing
+ // +--------+ address
+ // | rsi | |
+ // +--------+ V
+ // | rdx |
+ // +--------+
+ // | rcx |
+ // +--------+
+ // | r8 |
+ // +--------+
+ // | r9 |
+ // +--------+
+ // | xmm0 |
+ // +--------+
+ // | xmm1 |
+ // +--------+
+ // | xmm2 |
+ // +--------+
+ // | xmm3 |
+ // +--------+
+ // | xmm4 |
+ // +--------+
+ // | xmm5 |
+ // +--------+
+ // | xmm6 |
+ // +--------+
+ // | xmm7 |
+ // +--------+
+ // | fp |
+ // +--------+
+ // | caller |
+ // | ret ip |
+ // +--------+
+ // | arg0 | <-- any extra call args start here
+ // +--------+ offset = 18 * wordSize
+ // | . . . | (i.e. 1 * calladdr + 1 * rax + 6 * gp call regs
+ // + 8 * fp call regs + 2 * frame words)
+ //
+ // we use a unique sim/stack per thread
+ const int cursor2_offset = 18;
+ const int fp_offset = 16;
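+ // n.b. these offsets count u_int64_t slots in the layout above: slot 16
+ // holds the saved fp (1 fnptr + 1 rax + 6 gp regs + 8 fp regs) and slot
+ // 18 holds the first extra argument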
+ u_int64_t *cursor = (u_int64_t *)sp;
+ u_int64_t *cursor2 = ((u_int64_t *)sp) + cursor2_offset;
+ u_int64_t *fp = ((u_int64_t *)sp) + fp_offset;
+ int gp_arg_count = calltype & 0xf;
+ int fp_arg_count = (calltype >> 4) & 0xf;
+ int return_type = (calltype >> 8) & 0x3;
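+ // e.g. a call passing 3 gp args and 1 fp arg and returning a double is
+ // described by calltype = (MacroAssembler::ret_type_double << 8) | (1 << 4) | 3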
+ AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
+ // save previous cpu state in case this is a recursive entry
+ CPUState saveState = sim->getCPUState();
+ // set up initial sim pc, sp and fp registers
+ sim->init(*cursor++, (u_int64_t)sp, (u_int64_t)fp);
+ u_int64_t *return_slot = cursor++;
+
+ // if we need to pass the sim extra args on the stack then bump
+ // the stack pointer now
+ u_int64_t *cursor3 = (u_int64_t *)sim->getCPUState().xreg(SP, 1);
+ if (gp_arg_count > 8) {
+ cursor3 -= gp_arg_count - 8;
+ }
+ if (fp_arg_count > 8) {
+ cursor3 -= fp_arg_count - 8;
+ }
+ sim->getCPUState().xreg(SP, 1) = (u_int64_t)(cursor3++);
+
+ for (int i = 0; i < gp_arg_count; i++) {
+ if (i < 6) {
+ // copy saved register to sim register
+ GReg reg = (GReg)i;
+ sim->getCPUState().xreg(reg, 0) = *cursor++;
+ } else if (i < 8) {
+ // copy extra int arg to sim register
+ GReg reg = (GReg)i;
+ sim->getCPUState().xreg(reg, 0) = *cursor2++;
+ } else {
+ // copy extra gp arg to sim stack
+ *cursor3++ = *cursor2++;
+ }
+ }
+ for (int i = 0; i < fp_arg_count; i++) {
+ if (i < 8) {
+ // copy saved register to sim register
+ GReg reg = (GReg)i;
+ sim->getCPUState().xreg(reg, 0) = *cursor++;
+ } else {
+ // copy extra arg to sim stack
+ *cursor3++ = *cursor2++;
+ }
+ }
+ AArch64Simulator::status_t return_status = sim->run();
+ if (return_status != AArch64Simulator::STATUS_RETURN){
+ sim->simPrint0();
+ fatal("invalid status returned from simulator.run()\n");
+ }
+ switch (return_type) {
+ case MacroAssembler::ret_type_void:
+ default:
+ break;
+ case MacroAssembler::ret_type_integral:
+ // this overwrites the saved r0
+ *return_slot = sim->getCPUState().xreg(R0, 0);
+ break;
+ case MacroAssembler::ret_type_float:
+ *(float *)return_slot = sim->getCPUState().sreg(V0);
+ break;
+ case MacroAssembler::ret_type_double:
+ *(double *)return_slot = sim->getCPUState().dreg(V0);
+ break;
+ }
+ // restore incoming cpu state
+ sim->getCPUState() = saveState;
+}
+
+#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/aarch64_linkage.S Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,167 @@
+#
+# Copyright (c) 2012, Red Hat. All rights reserved.
+# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+#
+# This code is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 only, as
+# published by the Free Software Foundation.
+#
+# This code is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+# version 2 for more details (a copy is included in the LICENSE file that
+# accompanied this code).
+#
+# You should have received a copy of the GNU General Public License version
+# 2 along with this work; if not, write to the Free Software Foundation,
+# Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+#
+# Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+# or visit www.oracle.com if you need additional information or have any
+# questions.
+
+# Routines used to enable x86 VM C++ code to invoke JIT-compiled ARM code
+# -- either Java methods or generated stubs -- and to allow JIT-compiled
+# ARM code to invoke x86 VM C++ code
+#
+# the code for aarch64_stub_prolog below can be copied into the start
+# of the ARM code buffer and patched with a link to the
+# C++ routine which starts execution on the simulator. the ARM
+# code can be generated immediately following the copied code.
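+#
+# as a rough usage sketch (the copying code itself is not in this file): the
+# VM copies aarch64_stub_prolog_size() bytes starting at aarch64_stub_prolog
+# into the head of a code buffer, fills in the 64-bit call type word at
+# calltype_offset and emits the ARM code at arm_code_offset; when the copied
+# stub is executed it jumps through the pointer slot at entry_offset to
+# aarch64_prolog below, which switches to the simulator's alt stack and calls
+# setup_arm_sim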
+
+#ifdef BUILTIN_SIM
+
+ .data
+ .globl setup_arm_sim,
+ .type setup_arm_sim,@function
+ .globl get_alt_stack,
+ .type get_alt_stack,@function
+ .globl aarch64_stub_prolog
+ .p2align 4
+aarch64_stub_prolog:
+ // entry point
+4: lea 1f(%rip), %r11
+ mov (%r11), %r10
+ mov (%r10), %r10
+ jmp *%r10
+ .p2align 4
+1:
+ .set entry_offset, . - 1b
+ .quad aarch64_prolog_ptr
+ // 64 bit int used to identify called fn arg/return types
+ .set calltype_offset, . - 1b
+ .quad 0
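+ // n.b. setup_arm_sim (aarch64_call.cpp) decodes this word as
+ // bits[3:0] = gp arg count, bits[7:4] = fp arg count, bits[9:8] = return type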
+ // arm JIT code follows the stub
+ .set arm_code_offset, . - 1b
+ .size aarch64_stub_prolog, .-aarch64_stub_prolog
+aarch64_stub_prolog_end:
+
+ .text
+aarch64_prolog_ptr:
+ .quad aarch64_prolog
+
+ .globl aarch64_prolog
+aarch64_prolog:
+ .cfi_startproc
+ pushq %rbp
+ .cfi_def_cfa_offset 16
+ .cfi_offset 6, -16
+ movq %rsp, %rbp
+ .cfi_def_cfa_register 6
+ // save all registers used to pass args
+ sub $8, %rsp
+ movd %xmm7, (%rsp)
+ sub $8, %rsp
+ movd %xmm6, (%rsp)
+ sub $8, %rsp
+ movd %xmm5, (%rsp)
+ sub $8, %rsp
+ movd %xmm4, (%rsp)
+ sub $8, %rsp
+ movd %xmm3, (%rsp)
+ sub $8, %rsp
+ movd %xmm2, (%rsp)
+ sub $8, %rsp
+ movd %xmm1, (%rsp)
+ sub $8, %rsp
+ movd %xmm0, (%rsp)
+ push %r9
+ push %r8
+ push %rcx
+ push %rdx
+ push %rsi
+ push %rdi
+ // save rax -- this stack slot will be rewritten with a
+ // return value if needed
+ push %rax
+ // temporarily save r11 while we find the other stack
+ push %r11
+ // retrieve alt stack
+ call get_alt_stack@PLT
+ pop %r11
+ // push start of arm code
+ lea (arm_code_offset)(%r11), %rsi
+ push %rsi
+ // load call type code in arg reg 1
+ mov (calltype_offset)(%r11), %rsi
+ // load current stack pointer in arg reg 0
+ mov %rsp, %rdi
+ // switch to alt stack
+ mov %rax, %rsp
+ // save previous stack pointer on new stack
+ push %rdi
+ // 16-align the new stack pointer
+ push %rdi
+ // call sim setup routine
+ call setup_arm_sim@PLT
+ // switch back to old stack
+ pop %rsp
+ // pop start of arm code
+ pop %rdi
+ // pop rax -- either restores old value or installs return value
+ pop %rax
+ // pop arg registers
+ pop %rdi
+ pop %rsi
+ pop %rdx
+ pop %rcx
+ pop %r8
+ pop %r9
+ movd (%rsp), %xmm0
+ add $8, %rsp
+ movd (%rsp), %xmm1
+ add $8, %rsp
+ movd (%rsp), %xmm2
+ add $8, %rsp
+ movd (%rsp), %xmm3
+ add $8, %rsp
+ movd (%rsp), %xmm4
+ add $8, %rsp
+ movd (%rsp), %xmm5
+ add $8, %rsp
+ movd (%rsp), %xmm6
+ add $8, %rsp
+ movd (%rsp), %xmm7
+ add $8, %rsp
+ leave
+ .cfi_def_cfa 7, 8
+ ret
+ .cfi_endproc
+
+
+ .p2align 4
+get_pc:
+ // get return pc in rdi and then push it back
+ pop %rdi
+ push %rdi
+ ret
+
+ .p2align 4
+ .long
+ .globl aarch64_stub_prolog_size
+ .type aarch64_stub_prolog_size,@function
+aarch64_stub_prolog_size:
+ leaq aarch64_stub_prolog_end - aarch64_stub_prolog, %rax
+ ret
+
+#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,1526 @@
+/*
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include <stdio.h>
+#include <sys/types.h>
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "interpreter/interpreter.hpp"
+
+#ifndef PRODUCT
+const unsigned long Assembler::asm_bp = 0x00007fffee09ac88;
+#endif
+
+#include "compiler/disassembler.hpp"
+#include "memory/resourceArea.hpp"
+#include "runtime/interfaceSupport.hpp"
+#include "runtime/sharedRuntime.hpp"
+
+// for the moment we reuse the logical/floating point immediate encode
+// and decode functions provided by the simulator. when we move to
+// real hardware we will need to pull that code into here
+
+#include "immediate_aarch64.hpp"
+
+extern "C" void entry(CodeBuffer *cb);
+
+#define __ _masm.
+#ifdef PRODUCT
+#define BLOCK_COMMENT(str) /* nothing */
+#else
+#define BLOCK_COMMENT(str) block_comment(str)
+#endif
+
+#define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":")
+
+static float unpack(unsigned value);
+
+void entry(CodeBuffer *cb) {
+
+ // {
+ // for (int i = 0; i < 256; i+=16)
+ // {
+ // printf("\"%20.20g\", ", unpack(i));
+ // printf("\"%20.20g\", ", unpack(i+1));
+ // }
+ // printf("\n");
+ // }
+
+ Assembler _masm(cb);
+ address entry = __ pc();
+
+ // Smoke test for assembler
+
+#ifdef ASSERT
+// BEGIN Generated code -- do not edit
+// Generated by aarch64-asmtest.py
+ Label back, forth;
+ __ bind(back);
+
+// ArithOp
+ __ add(r19, r22, r7, Assembler::LSL, 28); // add x19, x22, x7, LSL #28
+ __ sub(r16, r11, r10, Assembler::LSR, 13); // sub x16, x11, x10, LSR #13
+ __ adds(r27, r13, r28, Assembler::ASR, 2); // adds x27, x13, x28, ASR #2
+ __ subs(r20, r28, r26, Assembler::ASR, 41); // subs x20, x28, x26, ASR #41
+ __ addw(r8, r19, r19, Assembler::ASR, 19); // add w8, w19, w19, ASR #19
+ __ subw(r4, r9, r10, Assembler::LSL, 14); // sub w4, w9, w10, LSL #14
+ __ addsw(r8, r11, r30, Assembler::LSL, 13); // adds w8, w11, w30, LSL #13
+ __ subsw(r0, r25, r19, Assembler::LSL, 9); // subs w0, w25, w19, LSL #9
+ __ andr(r20, r0, r21, Assembler::LSL, 19); // and x20, x0, x21, LSL #19
+ __ orr(r21, r14, r20, Assembler::LSL, 17); // orr x21, x14, x20, LSL #17
+ __ eor(r25, r28, r1, Assembler::LSL, 51); // eor x25, x28, x1, LSL #51
+ __ ands(r10, r27, r11, Assembler::ASR, 15); // ands x10, x27, x11, ASR #15
+ __ andw(r25, r5, r12, Assembler::ASR, 23); // and w25, w5, w12, ASR #23
+ __ orrw(r18, r14, r10, Assembler::LSR, 4); // orr w18, w14, w10, LSR #4
+ __ eorw(r4, r21, r5, Assembler::ASR, 22); // eor w4, w21, w5, ASR #22
+ __ andsw(r21, r0, r5, Assembler::ASR, 29); // ands w21, w0, w5, ASR #29
+ __ bic(r26, r30, r6, Assembler::ASR, 37); // bic x26, x30, x6, ASR #37
+ __ orn(r3, r1, r13, Assembler::LSR, 29); // orn x3, x1, x13, LSR #29
+ __ eon(r0, r28, r9, Assembler::LSL, 47); // eon x0, x28, x9, LSL #47
+ __ bics(r29, r5, r28, Assembler::LSL, 46); // bics x29, x5, x28, LSL #46
+ __ bicw(r9, r18, r7, Assembler::LSR, 20); // bic w9, w18, w7, LSR #20
+ __ ornw(r26, r13, r25, Assembler::ASR, 24); // orn w26, w13, w25, ASR #24
+ __ eonw(r25, r4, r19, Assembler::LSL, 6); // eon w25, w4, w19, LSL #6
+ __ bicsw(r5, r26, r4, Assembler::LSR, 24); // bics w5, w26, w4, LSR #24
+
+// AddSubImmOp
+ __ addw(r7, r19, 340u); // add w7, w19, #340
+ __ addsw(r8, r0, 401u); // adds w8, w0, #401
+ __ subw(r29, r20, 163u); // sub w29, w20, #163
+ __ subsw(r8, r23, 759u); // subs w8, w23, #759
+ __ add(r1, r12, 523u); // add x1, x12, #523
+ __ adds(r2, r11, 426u); // adds x2, x11, #426
+ __ sub(r14, r29, 716u); // sub x14, x29, #716
+ __ subs(r11, r5, 582u); // subs x11, x5, #582
+
+// LogicalImmOp
+ __ andw(r23, r22, 32768ul); // and w23, w22, #0x8000
+ __ orrw(r4, r10, 4042322160ul); // orr w4, w10, #0xf0f0f0f0
+ __ eorw(r0, r24, 4042322160ul); // eor w0, w24, #0xf0f0f0f0
+ __ andsw(r19, r29, 2139127680ul); // ands w19, w29, #0x7f807f80
+ __ andr(r5, r10, 4503599627354112ul); // and x5, x10, #0xfffffffffc000
+ __ orr(r12, r30, 18445618178097414144ul); // orr x12, x30, #0xfffc0000fffc0000
+ __ eor(r30, r5, 262128ul); // eor x30, x5, #0x3fff0
+ __ ands(r26, r23, 4194300ul); // ands x26, x23, #0x3ffffc
+
+// AbsOp
+ __ b(__ pc()); // b .
+ __ b(back); // b back
+ __ b(forth); // b forth
+ __ bl(__ pc()); // bl .
+ __ bl(back); // bl back
+ __ bl(forth); // bl forth
+
+// RegAndAbsOp
+ __ cbzw(r12, __ pc()); // cbz w12, .
+ __ cbzw(r12, back); // cbz w12, back
+ __ cbzw(r12, forth); // cbz w12, forth
+ __ cbnzw(r20, __ pc()); // cbnz w20, .
+ __ cbnzw(r20, back); // cbnz w20, back
+ __ cbnzw(r20, forth); // cbnz w20, forth
+ __ cbz(r12, __ pc()); // cbz x12, .
+ __ cbz(r12, back); // cbz x12, back
+ __ cbz(r12, forth); // cbz x12, forth
+ __ cbnz(r24, __ pc()); // cbnz x24, .
+ __ cbnz(r24, back); // cbnz x24, back
+ __ cbnz(r24, forth); // cbnz x24, forth
+ __ adr(r6, __ pc()); // adr x6, .
+ __ adr(r6, back); // adr x6, back
+ __ adr(r6, forth); // adr x6, forth
+ __ _adrp(r21, __ pc()); // adrp x21, .
+
+// RegImmAbsOp
+ __ tbz(r1, 1, __ pc()); // tbz x1, #1, .
+ __ tbz(r1, 1, back); // tbz x1, #1, back
+ __ tbz(r1, 1, forth); // tbz x1, #1, forth
+ __ tbnz(r8, 9, __ pc()); // tbnz x8, #9, .
+ __ tbnz(r8, 9, back); // tbnz x8, #9, back
+ __ tbnz(r8, 9, forth); // tbnz x8, #9, forth
+
+// MoveWideImmOp
+ __ movnw(r12, 23175, 0); // movn w12, #23175, lsl 0
+ __ movzw(r11, 20476, 16); // movz w11, #20476, lsl 16
+ __ movkw(r21, 3716, 0); // movk w21, #3716, lsl 0
+ __ movn(r29, 28661, 48); // movn x29, #28661, lsl 48
+ __ movz(r3, 6927, 0); // movz x3, #6927, lsl 0
+ __ movk(r22, 9828, 16); // movk x22, #9828, lsl 16
+
+// BitfieldOp
+ __ sbfm(r12, r8, 6, 22); // sbfm x12, x8, #6, #22
+ __ bfmw(r19, r25, 25, 19); // bfm w19, w25, #25, #19
+ __ ubfmw(r9, r12, 29, 15); // ubfm w9, w12, #29, #15
+ __ sbfm(r28, r25, 16, 16); // sbfm x28, x25, #16, #16
+ __ bfm(r12, r5, 4, 25); // bfm x12, x5, #4, #25
+ __ ubfm(r0, r10, 6, 8); // ubfm x0, x10, #6, #8
+
+// ExtractOp
+ __ extrw(r4, r13, r26, 24); // extr w4, w13, w26, #24
+ __ extr(r23, r30, r24, 31); // extr x23, x30, x24, #31
+
+// CondBranchOp
+ __ br(Assembler::EQ, __ pc()); // b.EQ .
+ __ br(Assembler::EQ, back); // b.EQ back
+ __ br(Assembler::EQ, forth); // b.EQ forth
+ __ br(Assembler::NE, __ pc()); // b.NE .
+ __ br(Assembler::NE, back); // b.NE back
+ __ br(Assembler::NE, forth); // b.NE forth
+ __ br(Assembler::HS, __ pc()); // b.HS .
+ __ br(Assembler::HS, back); // b.HS back
+ __ br(Assembler::HS, forth); // b.HS forth
+ __ br(Assembler::CS, __ pc()); // b.CS .
+ __ br(Assembler::CS, back); // b.CS back
+ __ br(Assembler::CS, forth); // b.CS forth
+ __ br(Assembler::LO, __ pc()); // b.LO .
+ __ br(Assembler::LO, back); // b.LO back
+ __ br(Assembler::LO, forth); // b.LO forth
+ __ br(Assembler::CC, __ pc()); // b.CC .
+ __ br(Assembler::CC, back); // b.CC back
+ __ br(Assembler::CC, forth); // b.CC forth
+ __ br(Assembler::MI, __ pc()); // b.MI .
+ __ br(Assembler::MI, back); // b.MI back
+ __ br(Assembler::MI, forth); // b.MI forth
+ __ br(Assembler::PL, __ pc()); // b.PL .
+ __ br(Assembler::PL, back); // b.PL back
+ __ br(Assembler::PL, forth); // b.PL forth
+ __ br(Assembler::VS, __ pc()); // b.VS .
+ __ br(Assembler::VS, back); // b.VS back
+ __ br(Assembler::VS, forth); // b.VS forth
+ __ br(Assembler::VC, __ pc()); // b.VC .
+ __ br(Assembler::VC, back); // b.VC back
+ __ br(Assembler::VC, forth); // b.VC forth
+ __ br(Assembler::HI, __ pc()); // b.HI .
+ __ br(Assembler::HI, back); // b.HI back
+ __ br(Assembler::HI, forth); // b.HI forth
+ __ br(Assembler::LS, __ pc()); // b.LS .
+ __ br(Assembler::LS, back); // b.LS back
+ __ br(Assembler::LS, forth); // b.LS forth
+ __ br(Assembler::GE, __ pc()); // b.GE .
+ __ br(Assembler::GE, back); // b.GE back
+ __ br(Assembler::GE, forth); // b.GE forth
+ __ br(Assembler::LT, __ pc()); // b.LT .
+ __ br(Assembler::LT, back); // b.LT back
+ __ br(Assembler::LT, forth); // b.LT forth
+ __ br(Assembler::GT, __ pc()); // b.GT .
+ __ br(Assembler::GT, back); // b.GT back
+ __ br(Assembler::GT, forth); // b.GT forth
+ __ br(Assembler::LE, __ pc()); // b.LE .
+ __ br(Assembler::LE, back); // b.LE back
+ __ br(Assembler::LE, forth); // b.LE forth
+ __ br(Assembler::AL, __ pc()); // b.AL .
+ __ br(Assembler::AL, back); // b.AL back
+ __ br(Assembler::AL, forth); // b.AL forth
+ __ br(Assembler::NV, __ pc()); // b.NV .
+ __ br(Assembler::NV, back); // b.NV back
+ __ br(Assembler::NV, forth); // b.NV forth
+
+// ImmOp
+ __ svc(12729); // svc #12729
+ __ hvc(6788); // hvc #6788
+ __ smc(1535); // smc #1535
+ __ brk(16766); // brk #16766
+ __ hlt(9753); // hlt #9753
+
+// Op
+ __ nop(); // nop
+ __ eret(); // eret
+ __ drps(); // drps
+ __ isb(); // isb
+
+// SystemOp
+ __ dsb(Assembler::SY); // dsb SY
+ __ dmb(Assembler::ISHST); // dmb ISHST
+
+// OneRegOp
+ __ br(r2); // br x2
+ __ blr(r5); // blr x5
+
+// LoadStoreExclusiveOp
+ __ stxr(r20, r21, r2); // stxr w20, x21, [x2]
+ __ stlxr(r7, r29, r7); // stlxr w7, x29, [x7]
+ __ ldxr(r5, r16); // ldxr x5, [x16]
+ __ ldaxr(r27, r29); // ldaxr x27, [x29]
+ __ stlr(r0, r29); // stlr x0, [x29]
+ __ ldar(r21, r28); // ldar x21, [x28]
+
+// LoadStoreExclusiveOp
+ __ stxrw(r24, r24, r7); // stxr w24, w24, [x7]
+ __ stlxrw(r21, r26, r28); // stlxr w21, w26, [x28]
+ __ ldxrw(r21, r6); // ldxr w21, [x6]
+ __ ldaxrw(r15, r30); // ldaxr w15, [x30]
+ __ stlrw(r19, r3); // stlr w19, [x3]
+ __ ldarw(r22, r2); // ldar w22, [x2]
+
+// LoadStoreExclusiveOp
+ __ stxrh(r18, r15, r0); // stxrh w18, w15, [x0]
+ __ stlxrh(r11, r5, r28); // stlxrh w11, w5, [x28]
+ __ ldxrh(r29, r6); // ldxrh w29, [x6]
+ __ ldaxrh(r18, r7); // ldaxrh w18, [x7]
+ __ stlrh(r25, r28); // stlrh w25, [x28]
+ __ ldarh(r2, r19); // ldarh w2, [x19]
+
+// LoadStoreExclusiveOp
+ __ stxrb(r10, r30, r1); // stxrb w10, w30, [x1]
+ __ stlxrb(r20, r21, r22); // stlxrb w20, w21, [x22]
+ __ ldxrb(r25, r2); // ldxrb w25, [x2]
+ __ ldaxrb(r24, r5); // ldaxrb w24, [x5]
+ __ stlrb(r16, r3); // stlrb w16, [x3]
+ __ ldarb(r22, r29); // ldarb w22, [x29]
+
+// LoadStoreExclusiveOp
+ __ ldxp(r8, r2, r19); // ldxp x8, x2, [x19]
+ __ ldaxp(r7, r19, r14); // ldaxp x7, x19, [x14]
+ __ stxp(r8, r27, r28, r5); // stxp w8, x27, x28, [x5]
+ __ stlxp(r6, r8, r14, r6); // stlxp w6, x8, x14, [x6]
+
+// LoadStoreExclusiveOp
+ __ ldxpw(r25, r4, r22); // ldxp w25, w4, [x22]
+ __ ldaxpw(r14, r14, r15); // ldaxp w14, w14, [x15]
+ __ stxpw(r20, r26, r8, r10); // stxp w20, w26, w8, [x10]
+ __ stlxpw(r23, r18, r18, r18); // stlxp w23, w18, w18, [x18]
+
+// base_plus_unscaled_offset
+// LoadStoreOp
+ __ str(r30, Address(r11, 99)); // str x30, [x11, 99]
+ __ strw(r23, Address(r25, -77)); // str w23, [x25, -77]
+ __ strb(r2, Address(r14, 3)); // strb w2, [x14, 3]
+ __ strh(r9, Address(r10, 5)); // strh w9, [x10, 5]
+ __ ldr(r20, Address(r15, 57)); // ldr x20, [x15, 57]
+ __ ldrw(r12, Address(r16, -78)); // ldr w12, [x16, -78]
+ __ ldrb(r22, Address(r26, -3)); // ldrb w22, [x26, -3]
+ __ ldrh(r30, Address(r19, -47)); // ldrh w30, [x19, -47]
+ __ ldrsb(r9, Address(r10, -12)); // ldrsb x9, [x10, -12]
+ __ ldrsh(r28, Address(r17, 14)); // ldrsh x28, [x17, 14]
+ __ ldrshw(r3, Address(r5, 10)); // ldrsh w3, [x5, 10]
+ __ ldrsw(r17, Address(r17, -91)); // ldrsw x17, [x17, -91]
+ __ ldrd(v2, Address(r20, -17)); // ldr d2, [x20, -17]
+ __ ldrs(v22, Address(r7, -10)); // ldr s22, [x7, -10]
+ __ strd(v30, Address(r18, -223)); // str d30, [x18, -223]
+ __ strs(v13, Address(r22, 21)); // str s13, [x22, 21]
+
+// pre
+// LoadStoreOp
+ __ str(r9, Address(__ pre(r18, -112))); // str x9, [x18, -112]!
+ __ strw(r29, Address(__ pre(r23, 11))); // str w29, [x23, 11]!
+ __ strb(r18, Address(__ pre(r12, -1))); // strb w18, [x12, -1]!
+ __ strh(r16, Address(__ pre(r20, -23))); // strh w16, [x20, -23]!
+ __ ldr(r3, Address(__ pre(r29, 9))); // ldr x3, [x29, 9]!
+ __ ldrw(r25, Address(__ pre(r3, 19))); // ldr w25, [x3, 19]!
+ __ ldrb(r1, Address(__ pre(r29, -1))); // ldrb w1, [x29, -1]!
+ __ ldrh(r8, Address(__ pre(r29, -57))); // ldrh w8, [x29, -57]!
+ __ ldrsb(r5, Address(__ pre(r14, -13))); // ldrsb x5, [x14, -13]!
+ __ ldrsh(r10, Address(__ pre(r27, 1))); // ldrsh x10, [x27, 1]!
+ __ ldrshw(r11, Address(__ pre(r10, 25))); // ldrsh w11, [x10, 25]!
+ __ ldrsw(r4, Address(__ pre(r22, -92))); // ldrsw x4, [x22, -92]!
+ __ ldrd(v11, Address(__ pre(r23, 8))); // ldr d11, [x23, 8]!
+ __ ldrs(v25, Address(__ pre(r19, 54))); // ldr s25, [x19, 54]!
+ __ strd(v1, Address(__ pre(r7, -174))); // str d1, [x7, -174]!
+ __ strs(v8, Address(__ pre(r25, 54))); // str s8, [x25, 54]!
+
+// post
+// LoadStoreOp
+ __ str(r5, Address(__ post(r11, 37))); // str x5, [x11], 37
+ __ strw(r24, Address(__ post(r15, 19))); // str w24, [x15], 19
+ __ strb(r15, Address(__ post(r26, -1))); // strb w15, [x26], -1
+ __ strh(r18, Address(__ post(r18, -6))); // strh w18, [x18], -6
+ __ ldr(r7, Address(__ post(r2, -230))); // ldr x7, [x2], -230
+ __ ldrw(r27, Address(__ post(r11, -27))); // ldr w27, [x11], -27
+ __ ldrb(r18, Address(__ post(r3, -25))); // ldrb w18, [x3], -25
+ __ ldrh(r10, Address(__ post(r24, -32))); // ldrh w10, [x24], -32
+ __ ldrsb(r22, Address(__ post(r10, 4))); // ldrsb x22, [x10], 4
+ __ ldrsh(r17, Address(__ post(r12, 25))); // ldrsh x17, [x12], 25
+ __ ldrshw(r8, Address(__ post(r7, -62))); // ldrsh w8, [x7], -62
+ __ ldrsw(r23, Address(__ post(r22, -51))); // ldrsw x23, [x22], -51
+ __ ldrd(v24, Address(__ post(r25, 48))); // ldr d24, [x25], 48
+ __ ldrs(v21, Address(__ post(r12, -10))); // ldr s21, [x12], -10
+ __ strd(v18, Address(__ post(r13, -222))); // str d18, [x13], -222
+ __ strs(v16, Address(__ post(r1, -41))); // str s16, [x1], -41
+
+// base_plus_reg
+// LoadStoreOp
+ __ str(r2, Address(r22, r15, Address::sxtw(0))); // str x2, [x22, w15, sxtw #0]
+ __ strw(r2, Address(r16, r29, Address::lsl(0))); // str w2, [x16, x29, lsl #0]
+ __ strb(r20, Address(r18, r14, Address::uxtw(0))); // strb w20, [x18, w14, uxtw #0]
+ __ strh(r6, Address(r19, r20, Address::sxtx(1))); // strh w6, [x19, x20, sxtx #1]
+ __ ldr(r14, Address(r29, r14, Address::sxtw(0))); // ldr x14, [x29, w14, sxtw #0]
+ __ ldrw(r16, Address(r20, r12, Address::sxtw(2))); // ldr w16, [x20, w12, sxtw #2]
+ __ ldrb(r9, Address(r12, r0, Address::sxtw(0))); // ldrb w9, [x12, w0, sxtw #0]
+ __ ldrh(r12, Address(r17, r3, Address::lsl(1))); // ldrh w12, [x17, x3, lsl #1]
+ __ ldrsb(r2, Address(r17, r3, Address::sxtx(0))); // ldrsb x2, [x17, x3, sxtx #0]
+ __ ldrsh(r7, Address(r1, r17, Address::uxtw(1))); // ldrsh x7, [x1, w17, uxtw #1]
+ __ ldrshw(r25, Address(r15, r18, Address::sxtw(1))); // ldrsh w25, [x15, w18, sxtw #1]
+ __ ldrsw(r23, Address(r21, r12, Address::lsl(0))); // ldrsw x23, [x21, x12, lsl #0]
+ __ ldrd(v5, Address(r13, r8, Address::lsl(3))); // ldr d5, [x13, x8, lsl #3]
+ __ ldrs(v3, Address(r10, r22, Address::lsl(2))); // ldr s3, [x10, x22, lsl #2]
+ __ strd(v14, Address(r2, r27, Address::sxtw(0))); // str d14, [x2, w27, sxtw #0]
+ __ strs(v20, Address(r6, r25, Address::lsl(0))); // str s20, [x6, x25, lsl #0]
+
+// base_plus_scaled_offset
+// LoadStoreOp
+ __ str(r30, Address(r7, 16256)); // str x30, [x7, 16256]
+ __ strw(r15, Address(r8, 7588)); // str w15, [x8, 7588]
+ __ strb(r11, Address(r0, 1866)); // strb w11, [x0, 1866]
+ __ strh(r3, Address(r17, 3734)); // strh w3, [x17, 3734]
+ __ ldr(r2, Address(r7, 14224)); // ldr x2, [x7, 14224]
+ __ ldrw(r5, Address(r9, 7396)); // ldr w5, [x9, 7396]
+ __ ldrb(r28, Address(r9, 1721)); // ldrb w28, [x9, 1721]
+ __ ldrh(r2, Address(r20, 3656)); // ldrh w2, [x20, 3656]
+ __ ldrsb(r22, Address(r14, 1887)); // ldrsb x22, [x14, 1887]
+ __ ldrsh(r8, Address(r0, 4080)); // ldrsh x8, [x0, 4080]
+ __ ldrshw(r0, Address(r30, 3916)); // ldrsh w0, [x30, 3916]
+ __ ldrsw(r24, Address(r19, 6828)); // ldrsw x24, [x19, 6828]
+ __ ldrd(v24, Address(r12, 13032)); // ldr d24, [x12, 13032]
+ __ ldrs(v8, Address(r8, 7452)); // ldr s8, [x8, 7452]
+ __ strd(v10, Address(r15, 15992)); // str d10, [x15, 15992]
+ __ strs(v26, Address(r19, 6688)); // str s26, [x19, 6688]
+
+// pcrel
+// LoadStoreOp
+ __ ldr(r10, forth); // ldr x10, forth
+ __ ldrw(r3, __ pc()); // ldr w3, .
+
+// LoadStoreOp
+ __ prfm(Address(r23, 9)); // prfm PLDL1KEEP, [x23, 9]
+
+// LoadStoreOp
+ __ prfm(back); // prfm PLDL1KEEP, back
+
+// LoadStoreOp
+ __ prfm(Address(r3, r8, Address::uxtw(0))); // prfm PLDL1KEEP, [x3, w8, uxtw #0]
+
+// LoadStoreOp
+ __ prfm(Address(r11, 15080)); // prfm PLDL1KEEP, [x11, 15080]
+
+// AddSubCarryOp
+ __ adcw(r13, r9, r28); // adc w13, w9, w28
+ __ adcsw(r27, r19, r28); // adcs w27, w19, w28
+ __ sbcw(r19, r18, r6); // sbc w19, w18, w6
+ __ sbcsw(r14, r20, r3); // sbcs w14, w20, w3
+ __ adc(r16, r14, r8); // adc x16, x14, x8
+ __ adcs(r0, r29, r8); // adcs x0, x29, x8
+ __ sbc(r8, r24, r20); // sbc x8, x24, x20
+ __ sbcs(r12, r28, r0); // sbcs x12, x28, x0
+
+// AddSubExtendedOp
+ __ addw(r23, r6, r16, ext::uxtb, 4); // add w23, w6, w16, uxtb #4
+ __ addsw(r25, r25, r23, ext::sxth, 2); // adds w25, w25, w23, sxth #2
+ __ sub(r26, r22, r4, ext::uxtx, 1); // sub x26, x22, x4, uxtx #1
+ __ subsw(r17, r29, r19, ext::sxtx, 3); // subs w17, w29, w19, sxtx #3
+ __ add(r11, r30, r21, ext::uxtb, 3); // add x11, x30, x21, uxtb #3
+ __ adds(r16, r19, r0, ext::sxtb, 2); // adds x16, x19, x0, sxtb #2
+ __ sub(r11, r9, r25, ext::sxtx, 1); // sub x11, x9, x25, sxtx #1
+ __ subs(r17, r20, r12, ext::sxtb, 4); // subs x17, x20, x12, sxtb #4
+
+// ConditionalCompareOp
+ __ ccmnw(r13, r11, 3u, Assembler::LE); // ccmn w13, w11, #3, LE
+ __ ccmpw(r13, r12, 2u, Assembler::HI); // ccmp w13, w12, #2, HI
+ __ ccmn(r3, r2, 12u, Assembler::NE); // ccmn x3, x2, #12, NE
+ __ ccmp(r7, r21, 3u, Assembler::VS); // ccmp x7, x21, #3, VS
+
+// ConditionalCompareImmedOp
+ __ ccmnw(r2, 14, 4, Assembler::CC); // ccmn w2, #14, #4, CC
+ __ ccmpw(r17, 17, 6, Assembler::PL); // ccmp w17, #17, #6, PL
+ __ ccmn(r10, 12, 0, Assembler::CS); // ccmn x10, #12, #0, CS
+ __ ccmp(r21, 18, 14, Assembler::GE); // ccmp x21, #18, #14, GE
+
+// ConditionalSelectOp
+ __ cselw(r21, r13, r12, Assembler::GT); // csel w21, w13, w12, GT
+ __ csincw(r10, r27, r15, Assembler::LS); // csinc w10, w27, w15, LS
+ __ csinvw(r0, r13, r9, Assembler::HI); // csinv w0, w13, w9, HI
+ __ csnegw(r18, r4, r26, Assembler::VS); // csneg w18, w4, w26, VS
+ __ csel(r12, r29, r7, Assembler::LS); // csel x12, x29, x7, LS
+ __ csinc(r6, r7, r20, Assembler::VC); // csinc x6, x7, x20, VC
+ __ csinv(r22, r21, r3, Assembler::LE); // csinv x22, x21, x3, LE
+ __ csneg(r19, r12, r27, Assembler::LS); // csneg x19, x12, x27, LS
+
+// TwoRegOp
+ __ rbitw(r0, r16); // rbit w0, w16
+ __ rev16w(r17, r23); // rev16 w17, w23
+ __ revw(r17, r14); // rev w17, w14
+ __ clzw(r24, r30); // clz w24, w30
+ __ clsw(r24, r22); // cls w24, w22
+ __ rbit(r3, r17); // rbit x3, x17
+ __ rev16(r12, r13); // rev16 x12, x13
+ __ rev32(r9, r22); // rev32 x9, x22
+ __ rev(r0, r0); // rev x0, x0
+ __ clz(r5, r16); // clz x5, x16
+ __ cls(r25, r22); // cls x25, x22
+
+// ThreeRegOp
+ __ udivw(r29, r4, r0); // udiv w29, w4, w0
+ __ sdivw(r0, r29, r29); // sdiv w0, w29, w29
+ __ lslvw(r5, r17, r21); // lslv w5, w17, w21
+ __ lsrvw(r9, r9, r18); // lsrv w9, w9, w18
+ __ asrvw(r1, r27, r8); // asrv w1, w27, w8
+ __ rorvw(r18, r20, r13); // rorv w18, w20, w13
+ __ udiv(r8, r25, r12); // udiv x8, x25, x12
+ __ sdiv(r7, r5, r28); // sdiv x7, x5, x28
+ __ lslv(r5, r17, r27); // lslv x5, x17, x27
+ __ lsrv(r23, r26, r20); // lsrv x23, x26, x20
+ __ asrv(r28, r8, r28); // asrv x28, x8, x28
+ __ rorv(r3, r29, r4); // rorv x3, x29, x4
+
+// FourRegMulOp
+ __ maddw(r17, r14, r26, r21); // madd w17, w14, w26, w21
+ __ msubw(r1, r30, r11, r11); // msub w1, w30, w11, w11
+ __ madd(r1, r17, r6, r28); // madd x1, x17, x6, x28
+ __ msub(r30, r6, r30, r8); // msub x30, x6, x30, x8
+ __ smaddl(r21, r6, r14, r8); // smaddl x21, w6, w14, x8
+ __ smsubl(r10, r10, r24, r19); // smsubl x10, w10, w24, x19
+ __ umaddl(r20, r18, r14, r24); // umaddl x20, w18, w14, x24
+ __ umsubl(r18, r2, r5, r5); // umsubl x18, w2, w5, x5
+
+// ThreeRegFloatOp
+ __ fmuls(v8, v18, v13); // fmul s8, s18, s13
+ __ fdivs(v2, v14, v28); // fdiv s2, s14, s28
+ __ fadds(v15, v12, v28); // fadd s15, s12, s28
+ __ fsubs(v0, v12, v1); // fsub s0, s12, s1
+ __ fmuls(v15, v29, v4); // fmul s15, s29, s4
+ __ fmuld(v12, v1, v23); // fmul d12, d1, d23
+ __ fdivd(v27, v8, v18); // fdiv d27, d8, d18
+ __ faddd(v23, v20, v11); // fadd d23, d20, d11
+ __ fsubd(v8, v12, v18); // fsub d8, d12, d18
+ __ fmuld(v26, v24, v23); // fmul d26, d24, d23
+
+// FourRegFloatOp
+ __ fmadds(v21, v23, v13, v25); // fmadd s21, s23, s13, s25
+ __ fmsubs(v22, v10, v1, v14); // fmsub s22, s10, s1, s14
+ __ fnmadds(v14, v20, v2, v30); // fnmadd s14, s20, s2, s30
+ __ fnmadds(v7, v29, v22, v22); // fnmadd s7, s29, s22, s22
+ __ fmaddd(v13, v5, v15, v5); // fmadd d13, d5, d15, d5
+ __ fmsubd(v14, v12, v5, v10); // fmsub d14, d12, d5, d10
+ __ fnmaddd(v10, v19, v0, v1); // fnmadd d10, d19, d0, d1
+ __ fnmaddd(v20, v2, v2, v0); // fnmadd d20, d2, d2, d0
+
+// TwoRegFloatOp
+ __ fmovs(v25, v9); // fmov s25, s9
+ __ fabss(v20, v4); // fabs s20, s4
+ __ fnegs(v3, v27); // fneg s3, s27
+ __ fsqrts(v1, v2); // fsqrt s1, s2
+ __ fcvts(v30, v0); // fcvt d30, s0
+ __ fmovd(v12, v4); // fmov d12, d4
+ __ fabsd(v1, v27); // fabs d1, d27
+ __ fnegd(v8, v22); // fneg d8, d22
+ __ fsqrtd(v11, v11); // fsqrt d11, d11
+ __ fcvtd(v22, v28); // fcvt s22, d28
+
+// FloatConvertOp
+ __ fcvtzsw(r28, v22); // fcvtzs w28, s22
+ __ fcvtzs(r20, v27); // fcvtzs x20, s27
+ __ fcvtzdw(r14, v0); // fcvtzs w14, d0
+ __ fcvtzd(r26, v11); // fcvtzs x26, d11
+ __ scvtfws(v28, r22); // scvtf s28, w22
+ __ scvtfs(v16, r10); // scvtf s16, x10
+ __ scvtfwd(v8, r21); // scvtf d8, w21
+ __ scvtfd(v21, r28); // scvtf d21, x28
+ __ fmovs(r24, v24); // fmov w24, s24
+ __ fmovd(r8, v19); // fmov x8, d19
+ __ fmovs(v8, r12); // fmov s8, w12
+ __ fmovd(v6, r7); // fmov d6, x7
+
+// TwoRegFloatOp
+ __ fcmps(v30, v16); // fcmp s30, s16
+ __ fcmpd(v25, v11); // fcmp d25, d11
+ __ fcmps(v11, 0.0); // fcmp s11, #0.0
+ __ fcmpd(v11, 0.0); // fcmp d11, #0.0
+
+// LoadStorePairOp
+ __ stpw(r29, r12, Address(r17, 128)); // stp w29, w12, [x17, #128]
+ __ ldpw(r22, r18, Address(r14, -96)); // ldp w22, w18, [x14, #-96]
+ __ ldpsw(r11, r16, Address(r1, 64)); // ldpsw x11, x16, [x1, #64]
+ __ stp(r0, r11, Address(r26, 112)); // stp x0, x11, [x26, #112]
+ __ ldp(r7, r1, Address(r26, 16)); // ldp x7, x1, [x26, #16]
+
+// LoadStorePairOp
+ __ stpw(r10, r7, Address(__ pre(r24, 0))); // stp w10, w7, [x24, #0]!
+ __ ldpw(r7, r28, Address(__ pre(r24, -256))); // ldp w7, w28, [x24, #-256]!
+ __ ldpsw(r25, r28, Address(__ pre(r21, -240))); // ldpsw x25, x28, [x21, #-240]!
+ __ stp(r20, r18, Address(__ pre(r14, -16))); // stp x20, x18, [x14, #-16]!
+ __ ldp(r8, r10, Address(__ pre(r13, 80))); // ldp x8, x10, [x13, #80]!
+
+// LoadStorePairOp
+ __ stpw(r26, r24, Address(__ post(r2, -128))); // stp w26, w24, [x2], #-128
+ __ ldpw(r2, r25, Address(__ post(r21, -192))); // ldp w2, w25, [x21], #-192
+ __ ldpsw(r17, r2, Address(__ post(r21, -144))); // ldpsw x17, x2, [x21], #-144
+ __ stp(r12, r10, Address(__ post(r11, 96))); // stp x12, x10, [x11], #96
+ __ ldp(r24, r6, Address(__ post(r17, -32))); // ldp x24, x6, [x17], #-32
+
+// LoadStorePairOp
+ __ stnpw(r3, r30, Address(r14, -224)); // stnp w3, w30, [x14, #-224]
+ __ ldnpw(r15, r20, Address(r26, -144)); // ldnp w15, w20, [x26, #-144]
+ __ stnp(r22, r25, Address(r12, -128)); // stnp x22, x25, [x12, #-128]
+ __ ldnp(r27, r22, Address(r17, -176)); // ldnp x27, x22, [x17, #-176]
+
+// FloatImmediateOp
+ __ fmovd(v0, 2.0); // fmov d0, #2.0
+ __ fmovd(v0, 2.125); // fmov d0, #2.125
+ __ fmovd(v0, 4.0); // fmov d0, #4.0
+ __ fmovd(v0, 4.25); // fmov d0, #4.25
+ __ fmovd(v0, 8.0); // fmov d0, #8.0
+ __ fmovd(v0, 8.5); // fmov d0, #8.5
+ __ fmovd(v0, 16.0); // fmov d0, #16.0
+ __ fmovd(v0, 17.0); // fmov d0, #17.0
+ __ fmovd(v0, 0.125); // fmov d0, #0.125
+ __ fmovd(v0, 0.1328125); // fmov d0, #0.1328125
+ __ fmovd(v0, 0.25); // fmov d0, #0.25
+ __ fmovd(v0, 0.265625); // fmov d0, #0.265625
+ __ fmovd(v0, 0.5); // fmov d0, #0.5
+ __ fmovd(v0, 0.53125); // fmov d0, #0.53125
+ __ fmovd(v0, 1.0); // fmov d0, #1.0
+ __ fmovd(v0, 1.0625); // fmov d0, #1.0625
+ __ fmovd(v0, -2.0); // fmov d0, #-2.0
+ __ fmovd(v0, -2.125); // fmov d0, #-2.125
+ __ fmovd(v0, -4.0); // fmov d0, #-4.0
+ __ fmovd(v0, -4.25); // fmov d0, #-4.25
+ __ fmovd(v0, -8.0); // fmov d0, #-8.0
+ __ fmovd(v0, -8.5); // fmov d0, #-8.5
+ __ fmovd(v0, -16.0); // fmov d0, #-16.0
+ __ fmovd(v0, -17.0); // fmov d0, #-17.0
+ __ fmovd(v0, -0.125); // fmov d0, #-0.125
+ __ fmovd(v0, -0.1328125); // fmov d0, #-0.1328125
+ __ fmovd(v0, -0.25); // fmov d0, #-0.25
+ __ fmovd(v0, -0.265625); // fmov d0, #-0.265625
+ __ fmovd(v0, -0.5); // fmov d0, #-0.5
+ __ fmovd(v0, -0.53125); // fmov d0, #-0.53125
+ __ fmovd(v0, -1.0); // fmov d0, #-1.0
+ __ fmovd(v0, -1.0625); // fmov d0, #-1.0625
+
+ __ bind(forth);
+
+/*
+aarch64ops.o: file format elf64-littleaarch64
+
+
+Disassembly of section .text:
+
+0000000000000000 <back>:
+ 0: 8b0772d3 add x19, x22, x7, lsl #28
+ 4: cb4a3570 sub x16, x11, x10, lsr #13
+ 8: ab9c09bb adds x27, x13, x28, asr #2
+ c: eb9aa794 subs x20, x28, x26, asr #41
+ 10: 0b934e68 add w8, w19, w19, asr #19
+ 14: 4b0a3924 sub w4, w9, w10, lsl #14
+ 18: 2b1e3568 adds w8, w11, w30, lsl #13
+ 1c: 6b132720 subs w0, w25, w19, lsl #9
+ 20: 8a154c14 and x20, x0, x21, lsl #19
+ 24: aa1445d5 orr x21, x14, x20, lsl #17
+ 28: ca01cf99 eor x25, x28, x1, lsl #51
+ 2c: ea8b3f6a ands x10, x27, x11, asr #15
+ 30: 0a8c5cb9 and w25, w5, w12, asr #23
+ 34: 2a4a11d2 orr w18, w14, w10, lsr #4
+ 38: 4a855aa4 eor w4, w21, w5, asr #22
+ 3c: 6a857415 ands w21, w0, w5, asr #29
+ 40: 8aa697da bic x26, x30, x6, asr #37
+ 44: aa6d7423 orn x3, x1, x13, lsr #29
+ 48: ca29bf80 eon x0, x28, x9, lsl #47
+ 4c: ea3cb8bd bics x29, x5, x28, lsl #46
+ 50: 0a675249 bic w9, w18, w7, lsr #20
+ 54: 2ab961ba orn w26, w13, w25, asr #24
+ 58: 4a331899 eon w25, w4, w19, lsl #6
+ 5c: 6a646345 bics w5, w26, w4, lsr #24
+ 60: 11055267 add w7, w19, #0x154
+ 64: 31064408 adds w8, w0, #0x191
+ 68: 51028e9d sub w29, w20, #0xa3
+ 6c: 710bdee8 subs w8, w23, #0x2f7
+ 70: 91082d81 add x1, x12, #0x20b
+ 74: b106a962 adds x2, x11, #0x1aa
+ 78: d10b33ae sub x14, x29, #0x2cc
+ 7c: f10918ab subs x11, x5, #0x246
+ 80: 121102d7 and w23, w22, #0x8000
+ 84: 3204cd44 orr w4, w10, #0xf0f0f0f0
+ 88: 5204cf00 eor w0, w24, #0xf0f0f0f0
+ 8c: 72099fb3 ands w19, w29, #0x7f807f80
+ 90: 92729545 and x5, x10, #0xfffffffffc000
+ 94: b20e37cc orr x12, x30, #0xfffc0000fffc0000
+ 98: d27c34be eor x30, x5, #0x3fff0
+ 9c: f27e4efa ands x26, x23, #0x3ffffc
+ a0: 14000000 b a0 <back+0xa0>
+ a4: 17ffffd7 b 0 <back>
+ a8: 1400017f b 6a4 <forth>
+ ac: 94000000 bl ac <back+0xac>
+ b0: 97ffffd4 bl 0 <back>
+ b4: 9400017c bl 6a4 <forth>
+ b8: 3400000c cbz w12, b8 <back+0xb8>
+ bc: 34fffa2c cbz w12, 0 <back>
+ c0: 34002f2c cbz w12, 6a4 <forth>
+ c4: 35000014 cbnz w20, c4 <back+0xc4>
+ c8: 35fff9d4 cbnz w20, 0 <back>
+ cc: 35002ed4 cbnz w20, 6a4 <forth>
+ d0: b400000c cbz x12, d0 <back+0xd0>
+ d4: b4fff96c cbz x12, 0 <back>
+ d8: b4002e6c cbz x12, 6a4 <forth>
+ dc: b5000018 cbnz x24, dc <back+0xdc>
+ e0: b5fff918 cbnz x24, 0 <back>
+ e4: b5002e18 cbnz x24, 6a4 <forth>
+ e8: 10000006 adr x6, e8 <back+0xe8>
+ ec: 10fff8a6 adr x6, 0 <back>
+ f0: 10002da6 adr x6, 6a4 <forth>
+ f4: 90000015 adrp x21, 0 <back>
+ f8: 36080001 tbz w1, #1, f8 <back+0xf8>
+ fc: 360ff821 tbz w1, #1, 0 <back>
+ 100: 36082d21 tbz w1, #1, 6a4 <forth>
+ 104: 37480008 tbnz w8, #9, 104 <back+0x104>
+ 108: 374ff7c8 tbnz w8, #9, 0 <back>
+ 10c: 37482cc8 tbnz w8, #9, 6a4 <forth>
+ 110: 128b50ec movn w12, #0x5a87
+ 114: 52a9ff8b movz w11, #0x4ffc, lsl #16
+ 118: 7281d095 movk w21, #0xe84
+ 11c: 92edfebd movn x29, #0x6ff5, lsl #48
+ 120: d28361e3 movz x3, #0x1b0f
+ 124: f2a4cc96 movk x22, #0x2664, lsl #16
+ 128: 9346590c sbfx x12, x8, #6, #17
+ 12c: 33194f33 bfi w19, w25, #7, #20
+ 130: 531d3d89 ubfiz w9, w12, #3, #16
+ 134: 9350433c sbfx x28, x25, #16, #1
+ 138: b34464ac bfxil x12, x5, #4, #22
+ 13c: d3462140 ubfx x0, x10, #6, #3
+ 140: 139a61a4 extr w4, w13, w26, #24
+ 144: 93d87fd7 extr x23, x30, x24, #31
+ 148: 54000000 b.eq 148 <back+0x148>
+ 14c: 54fff5a0 b.eq 0 <back>
+ 150: 54002aa0 b.eq 6a4 <forth>
+ 154: 54000001 b.ne 154 <back+0x154>
+ 158: 54fff541 b.ne 0 <back>
+ 15c: 54002a41 b.ne 6a4 <forth>
+ 160: 54000002 b.cs 160 <back+0x160>
+ 164: 54fff4e2 b.cs 0 <back>
+ 168: 540029e2 b.cs 6a4 <forth>
+ 16c: 54000002 b.cs 16c <back+0x16c>
+ 170: 54fff482 b.cs 0 <back>
+ 174: 54002982 b.cs 6a4 <forth>
+ 178: 54000003 b.cc 178 <back+0x178>
+ 17c: 54fff423 b.cc 0 <back>
+ 180: 54002923 b.cc 6a4 <forth>
+ 184: 54000003 b.cc 184 <back+0x184>
+ 188: 54fff3c3 b.cc 0 <back>
+ 18c: 540028c3 b.cc 6a4 <forth>
+ 190: 54000004 b.mi 190 <back+0x190>
+ 194: 54fff364 b.mi 0 <back>
+ 198: 54002864 b.mi 6a4 <forth>
+ 19c: 54000005 b.pl 19c <back+0x19c>
+ 1a0: 54fff305 b.pl 0 <back>
+ 1a4: 54002805 b.pl 6a4 <forth>
+ 1a8: 54000006 b.vs 1a8 <back+0x1a8>
+ 1ac: 54fff2a6 b.vs 0 <back>
+ 1b0: 540027a6 b.vs 6a4 <forth>
+ 1b4: 54000007 b.vc 1b4 <back+0x1b4>
+ 1b8: 54fff247 b.vc 0 <back>
+ 1bc: 54002747 b.vc 6a4 <forth>
+ 1c0: 54000008 b.hi 1c0 <back+0x1c0>
+ 1c4: 54fff1e8 b.hi 0 <back>
+ 1c8: 540026e8 b.hi 6a4 <forth>
+ 1cc: 54000009 b.ls 1cc <back+0x1cc>
+ 1d0: 54fff189 b.ls 0 <back>
+ 1d4: 54002689 b.ls 6a4 <forth>
+ 1d8: 5400000a b.ge 1d8 <back+0x1d8>
+ 1dc: 54fff12a b.ge 0 <back>
+ 1e0: 5400262a b.ge 6a4 <forth>
+ 1e4: 5400000b b.lt 1e4 <back+0x1e4>
+ 1e8: 54fff0cb b.lt 0 <back>
+ 1ec: 540025cb b.lt 6a4 <forth>
+ 1f0: 5400000c b.gt 1f0 <back+0x1f0>
+ 1f4: 54fff06c b.gt 0 <back>
+ 1f8: 5400256c b.gt 6a4 <forth>
+ 1fc: 5400000d b.le 1fc <back+0x1fc>
+ 200: 54fff00d b.le 0 <back>
+ 204: 5400250d b.le 6a4 <forth>
+ 208: 5400000e b.al 208 <back+0x208>
+ 20c: 54ffefae b.al 0 <back>
+ 210: 540024ae b.al 6a4 <forth>
+ 214: 5400000f b.nv 214 <back+0x214>
+ 218: 54ffef4f b.nv 0 <back>
+ 21c: 5400244f b.nv 6a4 <forth>
+ 220: d4063721 svc #0x31b9
+ 224: d4035082 hvc #0x1a84
+ 228: d400bfe3 smc #0x5ff
+ 22c: d4282fc0 brk #0x417e
+ 230: d444c320 hlt #0x2619
+ 234: d503201f nop
+ 238: d69f03e0 eret
+ 23c: d6bf03e0 drps
+ 240: d5033fdf isb
+ 244: d5033f9f dsb sy
+ 248: d5033abf dmb ishst
+ 24c: d61f0040 br x2
+ 250: d63f00a0 blr x5
+ 254: c8147c55 stxr w20, x21, [x2]
+ 258: c807fcfd stlxr w7, x29, [x7]
+ 25c: c85f7e05 ldxr x5, [x16]
+ 260: c85fffbb ldaxr x27, [x29]
+ 264: c89fffa0 stlr x0, [x29]
+ 268: c8dfff95 ldar x21, [x28]
+ 26c: 88187cf8 stxr w24, w24, [x7]
+ 270: 8815ff9a stlxr w21, w26, [x28]
+ 274: 885f7cd5 ldxr w21, [x6]
+ 278: 885fffcf ldaxr w15, [x30]
+ 27c: 889ffc73 stlr w19, [x3]
+ 280: 88dffc56 ldar w22, [x2]
+ 284: 48127c0f stxrh w18, w15, [x0]
+ 288: 480bff85 stlxrh w11, w5, [x28]
+ 28c: 485f7cdd ldxrh w29, [x6]
+ 290: 485ffcf2 ldaxrh w18, [x7]
+ 294: 489fff99 stlrh w25, [x28]
+ 298: 48dffe62 ldarh w2, [x19]
+ 29c: 080a7c3e stxrb w10, w30, [x1]
+ 2a0: 0814fed5 stlxrb w20, w21, [x22]
+ 2a4: 085f7c59 ldxrb w25, [x2]
+ 2a8: 085ffcb8 ldaxrb w24, [x5]
+ 2ac: 089ffc70 stlrb w16, [x3]
+ 2b0: 08dfffb6 ldarb w22, [x29]
+ 2b4: c87f0a68 ldxp x8, x2, [x19]
+ 2b8: c87fcdc7 ldaxp x7, x19, [x14]
+ 2bc: c82870bb stxp w8, x27, x28, [x5]
+ 2c0: c826b8c8 stlxp w6, x8, x14, [x6]
+ 2c4: 887f12d9 ldxp w25, w4, [x22]
+ 2c8: 887fb9ee ldaxp w14, w14, [x15]
+ 2cc: 8834215a stxp w20, w26, w8, [x10]
+ 2d0: 8837ca52 stlxp w23, w18, w18, [x18]
+ 2d4: f806317e str x30, [x11,#99]
+ 2d8: b81b3337 str w23, [x25,#-77]
+ 2dc: 39000dc2 strb w2, [x14,#3]
+ 2e0: 78005149 strh w9, [x10,#5]
+ 2e4: f84391f4 ldr x20, [x15,#57]
+ 2e8: b85b220c ldr w12, [x16,#-78]
+ 2ec: 385fd356 ldrb w22, [x26,#-3]
+ 2f0: 785d127e ldrh w30, [x19,#-47]
+ 2f4: 389f4149 ldrsb x9, [x10,#-12]
+ 2f8: 79801e3c ldrsh x28, [x17,#14]
+ 2fc: 79c014a3 ldrsh w3, [x5,#10]
+ 300: b89a5231 ldrsw x17, [x17,#-91]
+ 304: fc5ef282 ldr d2, [x20,#-17]
+ 308: bc5f60f6 ldr s22, [x7,#-10]
+ 30c: fc12125e str d30, [x18,#-223]
+ 310: bc0152cd str s13, [x22,#21]
+ 314: f8190e49 str x9, [x18,#-112]!
+ 318: b800befd str w29, [x23,#11]!
+ 31c: 381ffd92 strb w18, [x12,#-1]!
+ 320: 781e9e90 strh w16, [x20,#-23]!
+ 324: f8409fa3 ldr x3, [x29,#9]!
+ 328: b8413c79 ldr w25, [x3,#19]!
+ 32c: 385fffa1 ldrb w1, [x29,#-1]!
+ 330: 785c7fa8 ldrh w8, [x29,#-57]!
+ 334: 389f3dc5 ldrsb x5, [x14,#-13]!
+ 338: 78801f6a ldrsh x10, [x27,#1]!
+ 33c: 78c19d4b ldrsh w11, [x10,#25]!
+ 340: b89a4ec4 ldrsw x4, [x22,#-92]!
+ 344: fc408eeb ldr d11, [x23,#8]!
+ 348: bc436e79 ldr s25, [x19,#54]!
+ 34c: fc152ce1 str d1, [x7,#-174]!
+ 350: bc036f28 str s8, [x25,#54]!
+ 354: f8025565 str x5, [x11],#37
+ 358: b80135f8 str w24, [x15],#19
+ 35c: 381ff74f strb w15, [x26],#-1
+ 360: 781fa652 strh w18, [x18],#-6
+ 364: f851a447 ldr x7, [x2],#-230
+ 368: b85e557b ldr w27, [x11],#-27
+ 36c: 385e7472 ldrb w18, [x3],#-25
+ 370: 785e070a ldrh w10, [x24],#-32
+ 374: 38804556 ldrsb x22, [x10],#4
+ 378: 78819591 ldrsh x17, [x12],#25
+ 37c: 78dc24e8 ldrsh w8, [x7],#-62
+ 380: b89cd6d7 ldrsw x23, [x22],#-51
+ 384: fc430738 ldr d24, [x25],#48
+ 388: bc5f6595 ldr s21, [x12],#-10
+ 38c: fc1225b2 str d18, [x13],#-222
+ 390: bc1d7430 str s16, [x1],#-41
+ 394: f82fcac2 str x2, [x22,w15,sxtw]
+ 398: b83d6a02 str w2, [x16,x29]
+ 39c: 382e5a54 strb w20, [x18,w14,uxtw #0]
+ 3a0: 7834fa66 strh w6, [x19,x20,sxtx #1]
+ 3a4: f86ecbae ldr x14, [x29,w14,sxtw]
+ 3a8: b86cda90 ldr w16, [x20,w12,sxtw #2]
+ 3ac: 3860d989 ldrb w9, [x12,w0,sxtw #0]
+ 3b0: 78637a2c ldrh w12, [x17,x3,lsl #1]
+ 3b4: 38a3fa22 ldrsb x2, [x17,x3,sxtx #0]
+ 3b8: 78b15827 ldrsh x7, [x1,w17,uxtw #1]
+ 3bc: 78f2d9f9 ldrsh w25, [x15,w18,sxtw #1]
+ 3c0: b8ac6ab7 ldrsw x23, [x21,x12]
+ 3c4: fc6879a5 ldr d5, [x13,x8,lsl #3]
+ 3c8: bc767943 ldr s3, [x10,x22,lsl #2]
+ 3cc: fc3bc84e str d14, [x2,w27,sxtw]
+ 3d0: bc3968d4 str s20, [x6,x25]
+ 3d4: f91fc0fe str x30, [x7,#16256]
+ 3d8: b91da50f str w15, [x8,#7588]
+ 3dc: 391d280b strb w11, [x0,#1866]
+ 3e0: 791d2e23 strh w3, [x17,#3734]
+ 3e4: f95bc8e2 ldr x2, [x7,#14224]
+ 3e8: b95ce525 ldr w5, [x9,#7396]
+ 3ec: 395ae53c ldrb w28, [x9,#1721]
+ 3f0: 795c9282 ldrh w2, [x20,#3656]
+ 3f4: 399d7dd6 ldrsb x22, [x14,#1887]
+ 3f8: 799fe008 ldrsh x8, [x0,#4080]
+ 3fc: 79de9bc0 ldrsh w0, [x30,#3916]
+ 400: b99aae78 ldrsw x24, [x19,#6828]
+ 404: fd597598 ldr d24, [x12,#13032]
+ 408: bd5d1d08 ldr s8, [x8,#7452]
+ 40c: fd1f3dea str d10, [x15,#15992]
+ 410: bd1a227a str s26, [x19,#6688]
+ 414: 5800148a ldr x10, 6a4 <forth>
+ 418: 18000003 ldr w3, 418 <back+0x418>
+ 41c: f88092e0 prfm pldl1keep, [x23,#9]
+ 420: d8ffdf00 prfm pldl1keep, 0 <back>
+ 424: f8a84860 prfm pldl1keep, [x3,w8,uxtw]
+ 428: f99d7560 prfm pldl1keep, [x11,#15080]
+ 42c: 1a1c012d adc w13, w9, w28
+ 430: 3a1c027b adcs w27, w19, w28
+ 434: 5a060253 sbc w19, w18, w6
+ 438: 7a03028e sbcs w14, w20, w3
+ 43c: 9a0801d0 adc x16, x14, x8
+ 440: ba0803a0 adcs x0, x29, x8
+ 444: da140308 sbc x8, x24, x20
+ 448: fa00038c sbcs x12, x28, x0
+ 44c: 0b3010d7 add w23, w6, w16, uxtb #4
+ 450: 2b37ab39 adds w25, w25, w23, sxth #2
+ 454: cb2466da sub x26, x22, x4, uxtx #1
+ 458: 6b33efb1 subs w17, w29, w19, sxtx #3
+ 45c: 8b350fcb add x11, x30, w21, uxtb #3
+ 460: ab208a70 adds x16, x19, w0, sxtb #2
+ 464: cb39e52b sub x11, x9, x25, sxtx #1
+ 468: eb2c9291 subs x17, x20, w12, sxtb #4
+ 46c: 3a4bd1a3 ccmn w13, w11, #0x3, le
+ 470: 7a4c81a2 ccmp w13, w12, #0x2, hi
+ 474: ba42106c ccmn x3, x2, #0xc, ne
+ 478: fa5560e3 ccmp x7, x21, #0x3, vs
+ 47c: 3a4e3844 ccmn w2, #0xe, #0x4, cc
+ 480: 7a515a26 ccmp w17, #0x11, #0x6, pl
+ 484: ba4c2940 ccmn x10, #0xc, #0x0, cs
+ 488: fa52aaae ccmp x21, #0x12, #0xe, ge
+ 48c: 1a8cc1b5 csel w21, w13, w12, gt
+ 490: 1a8f976a csinc w10, w27, w15, ls
+ 494: 5a8981a0 csinv w0, w13, w9, hi
+ 498: 5a9a6492 csneg w18, w4, w26, vs
+ 49c: 9a8793ac csel x12, x29, x7, ls
+ 4a0: 9a9474e6 csinc x6, x7, x20, vc
+ 4a4: da83d2b6 csinv x22, x21, x3, le
+ 4a8: da9b9593 csneg x19, x12, x27, ls
+ 4ac: 5ac00200 rbit w0, w16
+ 4b0: 5ac006f1 rev16 w17, w23
+ 4b4: 5ac009d1 rev w17, w14
+ 4b8: 5ac013d8 clz w24, w30
+ 4bc: 5ac016d8 cls w24, w22
+ 4c0: dac00223 rbit x3, x17
+ 4c4: dac005ac rev16 x12, x13
+ 4c8: dac00ac9 rev32 x9, x22
+ 4cc: dac00c00 rev x0, x0
+ 4d0: dac01205 clz x5, x16
+ 4d4: dac016d9 cls x25, x22
+ 4d8: 1ac0089d udiv w29, w4, w0
+ 4dc: 1add0fa0 sdiv w0, w29, w29
+ 4e0: 1ad52225 lsl w5, w17, w21
+ 4e4: 1ad22529 lsr w9, w9, w18
+ 4e8: 1ac82b61 asr w1, w27, w8
+ 4ec: 1acd2e92 ror w18, w20, w13
+ 4f0: 9acc0b28 udiv x8, x25, x12
+ 4f4: 9adc0ca7 sdiv x7, x5, x28
+ 4f8: 9adb2225 lsl x5, x17, x27
+ 4fc: 9ad42757 lsr x23, x26, x20
+ 500: 9adc291c asr x28, x8, x28
+ 504: 9ac42fa3 ror x3, x29, x4
+ 508: 1b1a55d1 madd w17, w14, w26, w21
+ 50c: 1b0bafc1 msub w1, w30, w11, w11
+ 510: 9b067221 madd x1, x17, x6, x28
+ 514: 9b1ea0de msub x30, x6, x30, x8
+ 518: 9b2e20d5 smaddl x21, w6, w14, x8
+ 51c: 9b38cd4a smsubl x10, w10, w24, x19
+ 520: 9bae6254 umaddl x20, w18, w14, x24
+ 524: 9ba59452 umsubl x18, w2, w5, x5
+ 528: 1e2d0a48 fmul s8, s18, s13
+ 52c: 1e3c19c2 fdiv s2, s14, s28
+ 530: 1e3c298f fadd s15, s12, s28
+ 534: 1e213980 fsub s0, s12, s1
+ 538: 1e240baf fmul s15, s29, s4
+ 53c: 1e77082c fmul d12, d1, d23
+ 540: 1e72191b fdiv d27, d8, d18
+ 544: 1e6b2a97 fadd d23, d20, d11
+ 548: 1e723988 fsub d8, d12, d18
+ 54c: 1e770b1a fmul d26, d24, d23
+ 550: 1f0d66f5 fmadd s21, s23, s13, s25
+ 554: 1f01b956 fmsub s22, s10, s1, s14
+ 558: 1f227a8e fnmadd s14, s20, s2, s30
+ 55c: 1f365ba7 fnmadd s7, s29, s22, s22
+ 560: 1f4f14ad fmadd d13, d5, d15, d5
+ 564: 1f45a98e fmsub d14, d12, d5, d10
+ 568: 1f60066a fnmadd d10, d19, d0, d1
+ 56c: 1f620054 fnmadd d20, d2, d2, d0
+ 570: 1e204139 fmov s25, s9
+ 574: 1e20c094 fabs s20, s4
+ 578: 1e214363 fneg s3, s27
+ 57c: 1e21c041 fsqrt s1, s2
+ 580: 1e22c01e fcvt d30, s0
+ 584: 1e60408c fmov d12, d4
+ 588: 1e60c361 fabs d1, d27
+ 58c: 1e6142c8 fneg d8, d22
+ 590: 1e61c16b fsqrt d11, d11
+ 594: 1e624396 fcvt s22, d28
+ 598: 1e3802dc fcvtzs w28, s22
+ 59c: 9e380374 fcvtzs x20, s27
+ 5a0: 1e78000e fcvtzs w14, d0
+ 5a4: 9e78017a fcvtzs x26, d11
+ 5a8: 1e2202dc scvtf s28, w22
+ 5ac: 9e220150 scvtf s16, x10
+ 5b0: 1e6202a8 scvtf d8, w21
+ 5b4: 9e620395 scvtf d21, x28
+ 5b8: 1e260318 fmov w24, s24
+ 5bc: 9e660268 fmov x8, d19
+ 5c0: 1e270188 fmov s8, w12
+ 5c4: 9e6700e6 fmov d6, x7
+ 5c8: 1e3023c0 fcmp s30, s16
+ 5cc: 1e6b2320 fcmp d25, d11
+ 5d0: 1e202168 fcmp s11, #0.0
+ 5d4: 1e602168 fcmp d11, #0.0
+ 5d8: 2910323d stp w29, w12, [x17,#128]
+ 5dc: 297449d6 ldp w22, w18, [x14,#-96]
+ 5e0: 6948402b ldpsw x11, x16, [x1,#64]
+ 5e4: a9072f40 stp x0, x11, [x26,#112]
+ 5e8: a9410747 ldp x7, x1, [x26,#16]
+ 5ec: 29801f0a stp w10, w7, [x24,#0]!
+ 5f0: 29e07307 ldp w7, w28, [x24,#-256]!
+ 5f4: 69e272b9 ldpsw x25, x28, [x21,#-240]!
+ 5f8: a9bf49d4 stp x20, x18, [x14,#-16]!
+ 5fc: a9c529a8 ldp x8, x10, [x13,#80]!
+ 600: 28b0605a stp w26, w24, [x2],#-128
+ 604: 28e866a2 ldp w2, w25, [x21],#-192
+ 608: 68ee0ab1 ldpsw x17, x2, [x21],#-144
+ 60c: a886296c stp x12, x10, [x11],#96
+ 610: a8fe1a38 ldp x24, x6, [x17],#-32
+ 614: 282479c3 stnp w3, w30, [x14,#-224]
+ 618: 286e534f ldnp w15, w20, [x26,#-144]
+ 61c: a8386596 stnp x22, x25, [x12,#-128]
+ 620: a8755a3b ldnp x27, x22, [x17,#-176]
+ 624: 1e601000 fmov d0, #2.000000000000000000e+00
+ 628: 1e603000 fmov d0, #2.125000000000000000e+00
+ 62c: 1e621000 fmov d0, #4.000000000000000000e+00
+ 630: 1e623000 fmov d0, #4.250000000000000000e+00
+ 634: 1e641000 fmov d0, #8.000000000000000000e+00
+ 638: 1e643000 fmov d0, #8.500000000000000000e+00
+ 63c: 1e661000 fmov d0, #1.600000000000000000e+01
+ 640: 1e663000 fmov d0, #1.700000000000000000e+01
+ 644: 1e681000 fmov d0, #1.250000000000000000e-01
+ 648: 1e683000 fmov d0, #1.328125000000000000e-01
+ 64c: 1e6a1000 fmov d0, #2.500000000000000000e-01
+ 650: 1e6a3000 fmov d0, #2.656250000000000000e-01
+ 654: 1e6c1000 fmov d0, #5.000000000000000000e-01
+ 658: 1e6c3000 fmov d0, #5.312500000000000000e-01
+ 65c: 1e6e1000 fmov d0, #1.000000000000000000e+00
+ 660: 1e6e3000 fmov d0, #1.062500000000000000e+00
+ 664: 1e701000 fmov d0, #-2.000000000000000000e+00
+ 668: 1e703000 fmov d0, #-2.125000000000000000e+00
+ 66c: 1e721000 fmov d0, #-4.000000000000000000e+00
+ 670: 1e723000 fmov d0, #-4.250000000000000000e+00
+ 674: 1e741000 fmov d0, #-8.000000000000000000e+00
+ 678: 1e743000 fmov d0, #-8.500000000000000000e+00
+ 67c: 1e761000 fmov d0, #-1.600000000000000000e+01
+ 680: 1e763000 fmov d0, #-1.700000000000000000e+01
+ 684: 1e781000 fmov d0, #-1.250000000000000000e-01
+ 688: 1e783000 fmov d0, #-1.328125000000000000e-01
+ 68c: 1e7a1000 fmov d0, #-2.500000000000000000e-01
+ 690: 1e7a3000 fmov d0, #-2.656250000000000000e-01
+ 694: 1e7c1000 fmov d0, #-5.000000000000000000e-01
+ 698: 1e7c3000 fmov d0, #-5.312500000000000000e-01
+ 69c: 1e7e1000 fmov d0, #-1.000000000000000000e+00
+ 6a0: 1e7e3000 fmov d0, #-1.062500000000000000e+00
+ */
+
+ static const unsigned int insns[] =
+ {
+ 0x8b0772d3, 0xcb4a3570, 0xab9c09bb, 0xeb9aa794,
+ 0x0b934e68, 0x4b0a3924, 0x2b1e3568, 0x6b132720,
+ 0x8a154c14, 0xaa1445d5, 0xca01cf99, 0xea8b3f6a,
+ 0x0a8c5cb9, 0x2a4a11d2, 0x4a855aa4, 0x6a857415,
+ 0x8aa697da, 0xaa6d7423, 0xca29bf80, 0xea3cb8bd,
+ 0x0a675249, 0x2ab961ba, 0x4a331899, 0x6a646345,
+ 0x11055267, 0x31064408, 0x51028e9d, 0x710bdee8,
+ 0x91082d81, 0xb106a962, 0xd10b33ae, 0xf10918ab,
+ 0x121102d7, 0x3204cd44, 0x5204cf00, 0x72099fb3,
+ 0x92729545, 0xb20e37cc, 0xd27c34be, 0xf27e4efa,
+ 0x14000000, 0x17ffffd7, 0x1400017f, 0x94000000,
+ 0x97ffffd4, 0x9400017c, 0x3400000c, 0x34fffa2c,
+ 0x34002f2c, 0x35000014, 0x35fff9d4, 0x35002ed4,
+ 0xb400000c, 0xb4fff96c, 0xb4002e6c, 0xb5000018,
+ 0xb5fff918, 0xb5002e18, 0x10000006, 0x10fff8a6,
+ 0x10002da6, 0x90000015, 0x36080001, 0x360ff821,
+ 0x36082d21, 0x37480008, 0x374ff7c8, 0x37482cc8,
+ 0x128b50ec, 0x52a9ff8b, 0x7281d095, 0x92edfebd,
+ 0xd28361e3, 0xf2a4cc96, 0x9346590c, 0x33194f33,
+ 0x531d3d89, 0x9350433c, 0xb34464ac, 0xd3462140,
+ 0x139a61a4, 0x93d87fd7, 0x54000000, 0x54fff5a0,
+ 0x54002aa0, 0x54000001, 0x54fff541, 0x54002a41,
+ 0x54000002, 0x54fff4e2, 0x540029e2, 0x54000002,
+ 0x54fff482, 0x54002982, 0x54000003, 0x54fff423,
+ 0x54002923, 0x54000003, 0x54fff3c3, 0x540028c3,
+ 0x54000004, 0x54fff364, 0x54002864, 0x54000005,
+ 0x54fff305, 0x54002805, 0x54000006, 0x54fff2a6,
+ 0x540027a6, 0x54000007, 0x54fff247, 0x54002747,
+ 0x54000008, 0x54fff1e8, 0x540026e8, 0x54000009,
+ 0x54fff189, 0x54002689, 0x5400000a, 0x54fff12a,
+ 0x5400262a, 0x5400000b, 0x54fff0cb, 0x540025cb,
+ 0x5400000c, 0x54fff06c, 0x5400256c, 0x5400000d,
+ 0x54fff00d, 0x5400250d, 0x5400000e, 0x54ffefae,
+ 0x540024ae, 0x5400000f, 0x54ffef4f, 0x5400244f,
+ 0xd4063721, 0xd4035082, 0xd400bfe3, 0xd4282fc0,
+ 0xd444c320, 0xd503201f, 0xd69f03e0, 0xd6bf03e0,
+ 0xd5033fdf, 0xd5033f9f, 0xd5033abf, 0xd61f0040,
+ 0xd63f00a0, 0xc8147c55, 0xc807fcfd, 0xc85f7e05,
+ 0xc85fffbb, 0xc89fffa0, 0xc8dfff95, 0x88187cf8,
+ 0x8815ff9a, 0x885f7cd5, 0x885fffcf, 0x889ffc73,
+ 0x88dffc56, 0x48127c0f, 0x480bff85, 0x485f7cdd,
+ 0x485ffcf2, 0x489fff99, 0x48dffe62, 0x080a7c3e,
+ 0x0814fed5, 0x085f7c59, 0x085ffcb8, 0x089ffc70,
+ 0x08dfffb6, 0xc87f0a68, 0xc87fcdc7, 0xc82870bb,
+ 0xc826b8c8, 0x887f12d9, 0x887fb9ee, 0x8834215a,
+ 0x8837ca52, 0xf806317e, 0xb81b3337, 0x39000dc2,
+ 0x78005149, 0xf84391f4, 0xb85b220c, 0x385fd356,
+ 0x785d127e, 0x389f4149, 0x79801e3c, 0x79c014a3,
+ 0xb89a5231, 0xfc5ef282, 0xbc5f60f6, 0xfc12125e,
+ 0xbc0152cd, 0xf8190e49, 0xb800befd, 0x381ffd92,
+ 0x781e9e90, 0xf8409fa3, 0xb8413c79, 0x385fffa1,
+ 0x785c7fa8, 0x389f3dc5, 0x78801f6a, 0x78c19d4b,
+ 0xb89a4ec4, 0xfc408eeb, 0xbc436e79, 0xfc152ce1,
+ 0xbc036f28, 0xf8025565, 0xb80135f8, 0x381ff74f,
+ 0x781fa652, 0xf851a447, 0xb85e557b, 0x385e7472,
+ 0x785e070a, 0x38804556, 0x78819591, 0x78dc24e8,
+ 0xb89cd6d7, 0xfc430738, 0xbc5f6595, 0xfc1225b2,
+ 0xbc1d7430, 0xf82fcac2, 0xb83d6a02, 0x382e5a54,
+ 0x7834fa66, 0xf86ecbae, 0xb86cda90, 0x3860d989,
+ 0x78637a2c, 0x38a3fa22, 0x78b15827, 0x78f2d9f9,
+ 0xb8ac6ab7, 0xfc6879a5, 0xbc767943, 0xfc3bc84e,
+ 0xbc3968d4, 0xf91fc0fe, 0xb91da50f, 0x391d280b,
+ 0x791d2e23, 0xf95bc8e2, 0xb95ce525, 0x395ae53c,
+ 0x795c9282, 0x399d7dd6, 0x799fe008, 0x79de9bc0,
+ 0xb99aae78, 0xfd597598, 0xbd5d1d08, 0xfd1f3dea,
+ 0xbd1a227a, 0x5800148a, 0x18000003, 0xf88092e0,
+ 0xd8ffdf00, 0xf8a84860, 0xf99d7560, 0x1a1c012d,
+ 0x3a1c027b, 0x5a060253, 0x7a03028e, 0x9a0801d0,
+ 0xba0803a0, 0xda140308, 0xfa00038c, 0x0b3010d7,
+ 0x2b37ab39, 0xcb2466da, 0x6b33efb1, 0x8b350fcb,
+ 0xab208a70, 0xcb39e52b, 0xeb2c9291, 0x3a4bd1a3,
+ 0x7a4c81a2, 0xba42106c, 0xfa5560e3, 0x3a4e3844,
+ 0x7a515a26, 0xba4c2940, 0xfa52aaae, 0x1a8cc1b5,
+ 0x1a8f976a, 0x5a8981a0, 0x5a9a6492, 0x9a8793ac,
+ 0x9a9474e6, 0xda83d2b6, 0xda9b9593, 0x5ac00200,
+ 0x5ac006f1, 0x5ac009d1, 0x5ac013d8, 0x5ac016d8,
+ 0xdac00223, 0xdac005ac, 0xdac00ac9, 0xdac00c00,
+ 0xdac01205, 0xdac016d9, 0x1ac0089d, 0x1add0fa0,
+ 0x1ad52225, 0x1ad22529, 0x1ac82b61, 0x1acd2e92,
+ 0x9acc0b28, 0x9adc0ca7, 0x9adb2225, 0x9ad42757,
+ 0x9adc291c, 0x9ac42fa3, 0x1b1a55d1, 0x1b0bafc1,
+ 0x9b067221, 0x9b1ea0de, 0x9b2e20d5, 0x9b38cd4a,
+ 0x9bae6254, 0x9ba59452, 0x1e2d0a48, 0x1e3c19c2,
+ 0x1e3c298f, 0x1e213980, 0x1e240baf, 0x1e77082c,
+ 0x1e72191b, 0x1e6b2a97, 0x1e723988, 0x1e770b1a,
+ 0x1f0d66f5, 0x1f01b956, 0x1f227a8e, 0x1f365ba7,
+ 0x1f4f14ad, 0x1f45a98e, 0x1f60066a, 0x1f620054,
+ 0x1e204139, 0x1e20c094, 0x1e214363, 0x1e21c041,
+ 0x1e22c01e, 0x1e60408c, 0x1e60c361, 0x1e6142c8,
+ 0x1e61c16b, 0x1e624396, 0x1e3802dc, 0x9e380374,
+ 0x1e78000e, 0x9e78017a, 0x1e2202dc, 0x9e220150,
+ 0x1e6202a8, 0x9e620395, 0x1e260318, 0x9e660268,
+ 0x1e270188, 0x9e6700e6, 0x1e3023c0, 0x1e6b2320,
+ 0x1e202168, 0x1e602168, 0x2910323d, 0x297449d6,
+ 0x6948402b, 0xa9072f40, 0xa9410747, 0x29801f0a,
+ 0x29e07307, 0x69e272b9, 0xa9bf49d4, 0xa9c529a8,
+ 0x28b0605a, 0x28e866a2, 0x68ee0ab1, 0xa886296c,
+ 0xa8fe1a38, 0x282479c3, 0x286e534f, 0xa8386596,
+ 0xa8755a3b, 0x1e601000, 0x1e603000, 0x1e621000,
+ 0x1e623000, 0x1e641000, 0x1e643000, 0x1e661000,
+ 0x1e663000, 0x1e681000, 0x1e683000, 0x1e6a1000,
+ 0x1e6a3000, 0x1e6c1000, 0x1e6c3000, 0x1e6e1000,
+ 0x1e6e3000, 0x1e701000, 0x1e703000, 0x1e721000,
+ 0x1e723000, 0x1e741000, 0x1e743000, 0x1e761000,
+ 0x1e763000, 0x1e781000, 0x1e783000, 0x1e7a1000,
+ 0x1e7a3000, 0x1e7c1000, 0x1e7c3000, 0x1e7e1000,
+ 0x1e7e3000,
+ };
+// END Generated code -- do not edit
+
+ {
+ bool ok = true;
+ unsigned int *insns1 = (unsigned int *)entry;
+ for (unsigned int i = 0; i < sizeof insns / sizeof insns[0]; i++) {
+ if (insns[i] != insns1[i]) {
+ ok = false;
+ printf("Ours:\n");
+ Disassembler::decode((address)&insns1[i], (address)&insns1[i+1]);
+ printf("Theirs:\n");
+ Disassembler::decode((address)&insns[i], (address)&insns[i+1]);
+ printf("\n");
+ }
+ }
+ assert(ok, "Assembler smoke test failed");
+ }
+
+#ifndef PRODUCT
+
+ address PC = __ pc();
+ __ ld1(v0, __ T16B, Address(r16)); // No offset
+ __ ld1(v0, __ T16B, __ post(r16, 0)); // Post-index
+ __ ld1(v0, __ T16B, Address(r16, r17)); //
+
+
+#endif // PRODUCT
+#endif // ASSERT
+}
+
+#undef __
+
+void Assembler::emit_data64(jlong data,
+ relocInfo::relocType rtype,
+ int format) {
+ if (rtype == relocInfo::none) {
+ emit_int64(data);
+ } else {
+ emit_data64(data, Relocation::spec_simple(rtype), format);
+ }
+}
+
+void Assembler::emit_data64(jlong data,
+ RelocationHolder const& rspec,
+ int format) {
+
+ assert(inst_mark() != NULL, "must be inside InstructionMark");
+ // Do not use AbstractAssembler::relocate, which is not intended for
+ // embedded words. Instead, relocate to the enclosing instruction.
+ code_section()->relocate(inst_mark(), rspec, format);
+ emit_int64(data);
+}
+
+extern "C" {
+ void das(uint64_t start, int len) {
+ ResourceMark rm;
+ len <<= 2;
+ if (len < 0)
+ Disassembler::decode((address)start + len, (address)start);
+ else
+ Disassembler::decode((address)start, (address)start + len);
+ }
+
+ JNIEXPORT void das1(unsigned long insn) {
+ das(insn, 1);
+ }
+}
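+// Usage sketch: these are debugger conveniences; from gdb one can, for
+// example, evaluate
+//   call das(start_pc, 16)    // dump the 16 instructions starting at start_pc
+//   call das(start_pc, -16)   // dump the 16 instructions ending at start_pc
+// and call das1(insn_addr) to decode a single instruction.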
+
+#define gas_assert(ARG1) assert(ARG1, #ARG1)
+
+#define __ as->
+
+void Address::lea(MacroAssembler *as, Register r) const {
+ Relocation* reloc = _rspec.reloc();
+ relocInfo::relocType rtype = (relocInfo::relocType) reloc->type();
+
+ switch(_mode) {
+ case base_plus_offset: {
+ if (_offset == 0 && _base == r) // it's a nop
+ break;
+ if (_offset > 0)
+ __ add(r, _base, _offset);
+ else
+ __ sub(r, _base, -_offset);
+ break;
+ }
+ case base_plus_offset_reg: {
+ __ add(r, _base, _index, _ext.op(), MAX(_ext.shift(), 0));
+ break;
+ }
+ case literal: {
+ if (rtype == relocInfo::none)
+ __ mov(r, target());
+ else
+ __ movptr(r, (uint64_t)target());
+ break;
+ }
+ default:
+ ShouldNotReachHere();
+ }
+}
+
+void Assembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
+ ShouldNotReachHere();
+}
+
+#undef __
+
+#define starti Instruction_aarch64 do_not_use(this); set_current(&do_not_use)
+
+ void Assembler::adr(Register Rd, address adr) {
+ long offset = adr - pc();
+ int offset_lo = offset & 3;
+ offset >>= 2;
+ starti;
+ f(0, 31), f(offset_lo, 30, 29), f(0b10000, 28, 24), sf(offset, 23, 5);
+ rf(Rd, 0);
+ }
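+
+ // For example (sketch): if pc() is 0x1000 and adr targets 0x1007, the
+ // byte offset 7 is split into offset_lo == 3 (bits 30:29) and the
+ // remaining offset 1 (bits 23:5), which the hardware reassembles as
+ // immhi:immlo == 7.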
+
+ void Assembler::_adrp(Register Rd, address adr) {
+ uint64_t pc_page = (uint64_t)pc() >> 12;
+ uint64_t adr_page = (uint64_t)adr >> 12;
+ long offset = adr_page - pc_page;
+ int offset_lo = offset & 3;
+ offset >>= 2;
+ starti;
+ f(1, 31), f(offset_lo, 30, 29), f(0b10000, 28, 24), sf(offset, 23, 5);
+ rf(Rd, 0);
+ }
+
+#undef starti
+
+Address::Address(address target, relocInfo::relocType rtype) : _mode(literal){
+ _is_lval = false;
+ _target = target;
+ switch (rtype) {
+ case relocInfo::oop_type:
+ case relocInfo::metadata_type:
+ // Oops are a special case. Normally they would be their own section
+ // but in cases like icBuffer they are literals in the code stream that
+ // we don't have a section for. We use none so that we get a literal address
+ // which is always patchable.
+ break;
+ case relocInfo::external_word_type:
+ _rspec = external_word_Relocation::spec(target);
+ break;
+ case relocInfo::internal_word_type:
+ _rspec = internal_word_Relocation::spec(target);
+ break;
+ case relocInfo::opt_virtual_call_type:
+ _rspec = opt_virtual_call_Relocation::spec();
+ break;
+ case relocInfo::static_call_type:
+ _rspec = static_call_Relocation::spec();
+ break;
+ case relocInfo::runtime_call_type:
+ _rspec = runtime_call_Relocation::spec();
+ break;
+ case relocInfo::poll_type:
+ case relocInfo::poll_return_type:
+ _rspec = Relocation::spec_simple(rtype);
+ break;
+ case relocInfo::none:
+ _rspec = RelocationHolder::none;
+ break;
+ default:
+ ShouldNotReachHere();
+ break;
+ }
+}
+
+void Assembler::b(const Address &dest) {
+ code_section()->relocate(pc(), dest.rspec());
+ b(dest.target());
+}
+
+void Assembler::bl(const Address &dest) {
+ code_section()->relocate(pc(), dest.rspec());
+ bl(dest.target());
+}
+
+void Assembler::adr(Register r, const Address &dest) {
+ code_section()->relocate(pc(), dest.rspec());
+ adr(r, dest.target());
+}
+
+void Assembler::br(Condition cc, Label &L) {
+ if (L.is_bound()) {
+ br(cc, target(L));
+ } else {
+ L.add_patch_at(code(), locator());
+ br(cc, pc());
+ }
+}
+
+void Assembler::wrap_label(Label &L,
+ Assembler::uncond_branch_insn insn) {
+ if (L.is_bound()) {
+ (this->*insn)(target(L));
+ } else {
+ L.add_patch_at(code(), locator());
+ (this->*insn)(pc());
+ }
+}
+
+void Assembler::wrap_label(Register r, Label &L,
+ compare_and_branch_insn insn) {
+ if (L.is_bound()) {
+ (this->*insn)(r, target(L));
+ } else {
+ L.add_patch_at(code(), locator());
+ (this->*insn)(r, pc());
+ }
+}
+
+void Assembler::wrap_label(Register r, int bitpos, Label &L,
+ test_and_branch_insn insn) {
+ if (L.is_bound()) {
+ (this->*insn)(r, bitpos, target(L));
+ } else {
+ L.add_patch_at(code(), locator());
+ (this->*insn)(r, bitpos, pc());
+ }
+}
+
+void Assembler::wrap_label(Label &L, prfop op, prefetch_insn insn) {
+ if (L.is_bound()) {
+ (this->*insn)(target(L), op);
+ } else {
+ L.add_patch_at(code(), locator());
+ (this->*insn)(pc(), op);
+ }
+}
+
+// An "all-purpose" add/subtract immediate, per ARM documentation:
+// A "programmer-friendly" assembler may accept a negative immediate
+// between -(2^24 -1) and -1 inclusive, causing it to convert a
+// requested ADD operation to a SUB, or vice versa, and then encode
+// the absolute value of the immediate as for uimm24.
+void Assembler::add_sub_immediate(Register Rd, Register Rn, unsigned uimm, int op,
+ int negated_op) {
+ bool sets_flags = op & 1; // this op sets flags
+ union {
+ unsigned u;
+ int imm;
+ };
+ u = uimm;
+ bool shift = false;
+ bool neg = imm < 0;
+ if (neg) {
+ imm = -imm;
+ op = negated_op;
+ }
+ assert(Rd != sp || imm % 16 == 0, "misaligned stack");
+ if (imm >= (1 << 11)
+ && ((imm >> 12) << 12 == imm)) {
+ imm >>= 12;
+ shift = true;
+ }
+ f(op, 31, 29), f(0b10001, 28, 24), f(shift, 23, 22), f(imm, 21, 10);
+
+ // add/subtract immediate ops with the S bit set treat r31 as zr;
+ // with S unset they use sp.
+ if (sets_flags)
+ zrf(Rd, 0);
+ else
+ srf(Rd, 0);
+
+ srf(Rn, 5);
+}
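+
+// For example (sketch): add(r0, r1, -16) takes the negated path above and
+// is emitted as "sub r0, r1, #16", while an immediate such as 0x21000,
+// which does not fit in 12 bits but is a multiple of 4096, is emitted
+// with the shift bit set, i.e. as #0x21, LSL #12.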
+
+bool Assembler::operand_valid_for_add_sub_immediate(long imm) {
+ unsigned long uimm = uabs(imm);
+ if (uimm < (1 << 12))
+ return true;
+ if (uimm < (1 << 24)
+ && ((uimm >> 12) << 12 == uimm)) {
+ return true;
+ }
+ return false;
+}
+
+bool Assembler::operand_valid_for_logical_immediate(bool is32, uint64_t imm) {
+ return encode_logical_immediate(is32, imm) != 0xffffffff;
+}
+
+static uint64_t doubleTo64Bits(jdouble d) {
+ union {
+ jdouble double_value;
+ uint64_t double_bits;
+ };
+
+ double_value = d;
+ return double_bits;
+}
+
+bool Assembler::operand_valid_for_float_immediate(double imm) {
+ // If imm is all zero bits we can use ZR as the source of a
+ // floating-point value.
+ if (doubleTo64Bits(imm) == 0)
+ return true;
+
+ // Otherwise try to encode imm then convert the encoded value back
+ // and make sure it's the exact same bit pattern.
+ unsigned result = encoding_for_fp_immediate(imm);
+ return doubleTo64Bits(imm) == fp_immediate_for_encoding(result, true);
+}
+
+int AbstractAssembler::code_fill_byte() {
+ return 0;
+}
+
+// n.b. this is implemented in subclass MacroAssembler
+void Assembler::bang_stack_with_offset(int offset) { Unimplemented(); }
+
+
+// these are the functions provided by the simulator which are used to
+// encode and decode logical immediates and floating point immediates
+//
+// u_int64_t logical_immediate_for_encoding(u_int32_t encoding);
+//
+// u_int32_t encoding_for_logical_immediate(u_int64_t immediate);
+//
+// u_int64_t fp_immediate_for_encoding(u_int32_t imm8, int is_dp);
+//
+// u_int32_t encoding_for_fp_immediate(float immediate);
+//
+// we currently import these from the simulator library but the
+// definitions will need to be moved here when we switch to real
+// hardware.
+
+// and now the routines called by the assembler which encapsulate the
+// above encode and decode functions
+
+uint32_t
+asm_util::encode_logical_immediate(bool is32, uint64_t imm)
+{
+ if (is32) {
+ /* Allow all zeros or all ones in top 32-bits, so that
+ constant expressions like ~1 are permitted. */
+ if (imm >> 32 != 0 && imm >> 32 != 0xffffffff)
+ return 0xffffffff;
+ /* Replicate the 32 lower bits to the 32 upper bits. */
+ imm &= 0xffffffff;
+ imm |= imm << 32;
+ }
+
+ return encoding_for_logical_immediate(imm);
+}
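+
+// For example (sketch): andw(r0, r1, ~1) arrives here with imm ==
+// 0xfffffffffffffffe; the top word is all ones, so it is accepted, and
+// after masking and replication the underlying encoder sees
+// 0xfffffffefffffffe.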
+
+unsigned Assembler::pack(double value) {
+ float val = (float)value;
+ unsigned result = encoding_for_fp_immediate(val);
+ guarantee(unpack(result) == value,
+ "Invalid floating-point immediate operand");
+ return result;
+}
+
+// Packed operands for Floating-point Move (immediate)
+
+static float unpack(unsigned value) {
+ union {
+ unsigned ival;
+ float val;
+ };
+ ival = fp_immediate_for_encoding(value, 0);
+ return val;
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,2340 @@
+/*
+ * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_ASSEMBLER_AARCH64_HPP
+#define CPU_AARCH64_VM_ASSEMBLER_AARCH64_HPP
+
+#include "asm/register.hpp"
+
+// definitions of various symbolic names for machine registers
+
+// First the intercalls between C and Java, which use 8 general
+// registers and 8 floating-point registers
+
+// we also have to copy between x86 and ARM registers, but that's a
+// secondary complication -- not all code employing the C call
+// convention executes as x86 code though -- we generate some of it
+
+class Argument VALUE_OBJ_CLASS_SPEC {
+ public:
+ enum {
+ n_int_register_parameters_c = 8, // r0, r1, ... r7 (c_rarg0, c_rarg1, ...)
+ n_float_register_parameters_c = 8, // v0, v1, ... v7 (c_farg0, c_farg1, ... )
+
+ n_int_register_parameters_j = 8, // r1, ... r7, r0 (j_rarg0, j_rarg1, ...)
+ n_float_register_parameters_j = 8 // v0, v1, ... v7 (j_farg0, j_farg1, ...)
+ };
+};
+
+REGISTER_DECLARATION(Register, c_rarg0, r0);
+REGISTER_DECLARATION(Register, c_rarg1, r1);
+REGISTER_DECLARATION(Register, c_rarg2, r2);
+REGISTER_DECLARATION(Register, c_rarg3, r3);
+REGISTER_DECLARATION(Register, c_rarg4, r4);
+REGISTER_DECLARATION(Register, c_rarg5, r5);
+REGISTER_DECLARATION(Register, c_rarg6, r6);
+REGISTER_DECLARATION(Register, c_rarg7, r7);
+
+REGISTER_DECLARATION(FloatRegister, c_farg0, v0);
+REGISTER_DECLARATION(FloatRegister, c_farg1, v1);
+REGISTER_DECLARATION(FloatRegister, c_farg2, v2);
+REGISTER_DECLARATION(FloatRegister, c_farg3, v3);
+REGISTER_DECLARATION(FloatRegister, c_farg4, v4);
+REGISTER_DECLARATION(FloatRegister, c_farg5, v5);
+REGISTER_DECLARATION(FloatRegister, c_farg6, v6);
+REGISTER_DECLARATION(FloatRegister, c_farg7, v7);
+
+// Symbolically name the register arguments used by the Java calling convention.
+// We have control over the convention for java so we can do what we please.
+// What pleases us is to offset the java calling convention so that when
+// we call a suitable jni method the arguments are lined up and we don't
+// have to do much shuffling. A suitable jni method is non-static and takes
+// a small number of arguments
+//
+// |--------------------------------------------------------------------|
+// | c_rarg0 c_rarg1 c_rarg2 c_rarg3 c_rarg4 c_rarg5 c_rarg6 c_rarg7 |
+// |--------------------------------------------------------------------|
+// | r0 r1 r2 r3 r4 r5 r6 r7 |
+// |--------------------------------------------------------------------|
+// | j_rarg7 j_rarg0 j_rarg1 j_rarg2 j_rarg3 j_rarg4 j_rarg5 j_rarg6 |
+// |--------------------------------------------------------------------|
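+//
+// For example (sketch): for a call to a non-static native method taking
+// one int, the interpreter already has the receiver in j_rarg0 (r1) and
+// the int in j_rarg1 (r2); the native entry expects JNIEnv* in c_rarg0
+// (r0), the receiver handle in c_rarg1 (r1) and the int in c_rarg2 (r2),
+// so only the JNIEnv needs to be inserted.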
+
+
+REGISTER_DECLARATION(Register, j_rarg0, c_rarg1);
+REGISTER_DECLARATION(Register, j_rarg1, c_rarg2);
+REGISTER_DECLARATION(Register, j_rarg2, c_rarg3);
+REGISTER_DECLARATION(Register, j_rarg3, c_rarg4);
+REGISTER_DECLARATION(Register, j_rarg4, c_rarg5);
+REGISTER_DECLARATION(Register, j_rarg5, c_rarg6);
+REGISTER_DECLARATION(Register, j_rarg6, c_rarg7);
+REGISTER_DECLARATION(Register, j_rarg7, c_rarg0);
+
+// Java floating args are passed as per C
+
+REGISTER_DECLARATION(FloatRegister, j_farg0, v0);
+REGISTER_DECLARATION(FloatRegister, j_farg1, v1);
+REGISTER_DECLARATION(FloatRegister, j_farg2, v2);
+REGISTER_DECLARATION(FloatRegister, j_farg3, v3);
+REGISTER_DECLARATION(FloatRegister, j_farg4, v4);
+REGISTER_DECLARATION(FloatRegister, j_farg5, v5);
+REGISTER_DECLARATION(FloatRegister, j_farg6, v6);
+REGISTER_DECLARATION(FloatRegister, j_farg7, v7);
+
+// registers used to hold VM data either temporarily within a method
+// or across method calls
+
+// volatile (caller-save) registers
+
+// r8 is used for indirect result location return
+// we use it and r9 as scratch registers
+REGISTER_DECLARATION(Register, rscratch1, r8);
+REGISTER_DECLARATION(Register, rscratch2, r9);
+
+// current method -- must be in a call-clobbered register
+REGISTER_DECLARATION(Register, rmethod, r12);
+
+// non-volatile (callee-save) registers are r19-29
+// of which the following are dedicated global state
+
+// link register
+REGISTER_DECLARATION(Register, lr, r30);
+// frame pointer
+REGISTER_DECLARATION(Register, rfp, r29);
+// current thread
+REGISTER_DECLARATION(Register, rthread, r28);
+// base of heap
+REGISTER_DECLARATION(Register, rheapbase, r27);
+// constant pool cache
+REGISTER_DECLARATION(Register, rcpool, r26);
+// monitors allocated on stack
+REGISTER_DECLARATION(Register, rmonitors, r25);
+// locals on stack
+REGISTER_DECLARATION(Register, rlocals, r24);
+// bytecode pointer
+REGISTER_DECLARATION(Register, rbcp, r22);
+// Dispatch table base
+REGISTER_DECLARATION(Register, rdispatch, r21);
+// Java stack pointer
+REGISTER_DECLARATION(Register, esp, r20);
+
+// TODO : x86 uses rbp to save SP in method handle code
+// we may need to do the same with fp
+// JSR 292 fixed register usages:
+//REGISTER_DECLARATION(Register, r_mh_SP_save, r29);
+
+#define assert_cond(ARG1) assert(ARG1, #ARG1)
+
+namespace asm_util {
+ uint32_t encode_logical_immediate(bool is32, uint64_t imm);
+};
+
+using namespace asm_util;
+
+
+class Assembler;
+
+class Instruction_aarch64 {
+ unsigned insn;
+#ifdef ASSERT
+ unsigned bits;
+#endif
+ Assembler *assem;
+
+public:
+
+ Instruction_aarch64(class Assembler *as) {
+#ifdef ASSERT
+ bits = 0;
+#endif
+ insn = 0;
+ assem = as;
+ }
+
+ inline ~Instruction_aarch64();
+
+ unsigned &get_insn() { return insn; }
+#ifdef ASSERT
+ unsigned &get_bits() { return bits; }
+#endif
+
+ static inline int32_t extend(unsigned val, int hi = 31, int lo = 0) {
+ union {
+ unsigned u;
+ int n;
+ };
+
+ u = val << (31 - hi);
+ n = n >> (31 - hi + lo);
+ return n;
+ }
+
+ static inline uint32_t extract(uint32_t val, int msb, int lsb) {
+ int nbits = msb - lsb + 1;
+ assert_cond(msb >= lsb);
+ uint32_t mask = (1U << nbits) - 1;
+ uint32_t result = val >> lsb;
+ result &= mask;
+ return result;
+ }
+
+ static inline int32_t sextract(uint32_t val, int msb, int lsb) {
+ uint32_t uval = extract(val, msb, lsb);
+ return extend(uval, msb - lsb);
+ }
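+
+ // For example (sketch): extend(0b111, 2) == -1 while extend(0b011, 2) == 3,
+ // and sextract(insn, 23, 5) recovers the signed 19-bit offset field of a
+ // conditional branch with the sign propagated.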
+
+ static void patch(address a, int msb, int lsb, unsigned long val) {
+ int nbits = msb - lsb + 1;
+ guarantee(val < (1U << nbits), "Field too big for insn");
+ assert_cond(msb >= lsb);
+ unsigned mask = (1U << nbits) - 1;
+ val <<= lsb;
+ mask <<= lsb;
+ unsigned target = *(unsigned *)a;
+ target &= ~mask;
+ target |= val;
+ *(unsigned *)a = target;
+ }
+
+ static void spatch(address a, int msb, int lsb, long val) {
+ int nbits = msb - lsb + 1;
+ long chk = val >> (nbits - 1);
+ guarantee (chk == -1 || chk == 0, "Field too big for insn");
+ unsigned uval = val;
+ unsigned mask = (1U << nbits) - 1;
+ uval &= mask;
+ uval <<= lsb;
+ mask <<= lsb;
+ unsigned target = *(unsigned *)a;
+ target &= ~mask;
+ target |= uval;
+ *(unsigned *)a = target;
+ }
+
+ void f(unsigned val, int msb, int lsb) {
+ int nbits = msb - lsb + 1;
+ guarantee(val < (1U << nbits), "Field too big for insn");
+ assert_cond(msb >= lsb);
+ unsigned mask = (1U << nbits) - 1;
+ val <<= lsb;
+ mask <<= lsb;
+ insn |= val;
+ assert_cond((bits & mask) == 0);
+#ifdef ASSERT
+ bits |= mask;
+#endif
+ }
+
+ void f(unsigned val, int bit) {
+ f(val, bit, bit);
+ }
+
+ void sf(long val, int msb, int lsb) {
+ int nbits = msb - lsb + 1;
+ long chk = val >> (nbits - 1);
+ guarantee (chk == -1 || chk == 0, "Field too big for insn");
+ unsigned uval = val;
+ unsigned mask = (1U << nbits) - 1;
+ uval &= mask;
+ f(uval, lsb + nbits - 1, lsb);
+ }
+
+ void rf(Register r, int lsb) {
+ f(r->encoding_nocheck(), lsb + 4, lsb);
+ }
+
+ // reg|ZR
+ void zrf(Register r, int lsb) {
+ f(r->encoding_nocheck() - (r == zr), lsb + 4, lsb);
+ }
+
+ // reg|SP
+ void srf(Register r, int lsb) {
+ f(r == sp ? 31 : r->encoding_nocheck(), lsb + 4, lsb);
+ }
+
+ void rf(FloatRegister r, int lsb) {
+ f(r->encoding_nocheck(), lsb + 4, lsb);
+ }
+
+ unsigned get(int msb = 31, int lsb = 0) {
+ int nbits = msb - lsb + 1;
+ unsigned mask = ((1U << nbits) - 1) << lsb;
+ assert_cond((bits & mask) == mask);
+ return (insn & mask) >> lsb;
+ }
+
+ void fixed(unsigned value, unsigned mask) {
+ assert_cond ((mask & bits) == 0);
+#ifdef ASSERT
+ bits |= mask;
+#endif
+ insn |= value;
+ }
+};
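+
+// For example (sketch): hint(0) -- the canonical nop, see below -- is built
+// field by field: f(0b11010101000, 31, 21), f(0b00, 20, 19), f(0b011, 18, 16),
+// f(0b0010, 15, 12), f(0, 11, 8), f(0, 7, 5) and a register-31 rt field OR
+// together to give 0xd503201f; in debug builds emit() also checks that every
+// one of the 32 bits was assigned exactly once.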
+
+#define starti Instruction_aarch64 do_not_use(this); set_current(&do_not_use)
+
+class PrePost {
+ int _offset;
+ Register _r;
+public:
+ PrePost(Register reg, int o) : _r(reg), _offset(o) { }
+ int offset() { return _offset; }
+ Register reg() { return _r; }
+};
+
+class Pre : public PrePost {
+public:
+ Pre(Register reg, int o) : PrePost(reg, o) { }
+};
+class Post : public PrePost {
+public:
+ Post(Register reg, int o) : PrePost(reg, o) { }
+};
+
+namespace ext
+{
+ enum operation { uxtb, uxth, uxtw, uxtx, sxtb, sxth, sxtw, sxtx };
+};
+
+// abs methods which cannot overflow and so are well-defined across
+// the entire domain of integer types.
+static inline unsigned int uabs(unsigned int n) {
+ union {
+ unsigned int result;
+ int value;
+ };
+ result = n;
+ if (value < 0) result = -result;
+ return result;
+}
+static inline unsigned long uabs(unsigned long n) {
+ union {
+ unsigned long result;
+ long value;
+ };
+ result = n;
+ if (value < 0) result = -result;
+ return result;
+}
+static inline unsigned long uabs(long n) { return uabs((unsigned long)n); }
+static inline unsigned long uabs(int n) { return uabs((unsigned int)n); }
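+
+// For example (sketch): uabs(min_jlong) cannot be represented as a positive
+// long, but because the negation is performed on the unsigned member of the
+// union the result is the correct value 2^63.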
+
+// Addressing modes
+class Address VALUE_OBJ_CLASS_SPEC {
+ public:
+
+ enum mode { no_mode, base_plus_offset, pre, post, pcrel,
+ base_plus_offset_reg, literal };
+
+ // Shift and extend for base reg + reg offset addressing
+ class extend {
+ int _option, _shift;
+ ext::operation _op;
+ public:
+ extend() { }
+ extend(int s, int o, ext::operation op) : _shift(s), _option(o), _op(op) { }
+ int option() const{ return _option; }
+ int shift() const { return _shift; }
+ ext::operation op() const { return _op; }
+ };
+ class uxtw : public extend {
+ public:
+ uxtw(int shift = -1): extend(shift, 0b010, ext::uxtw) { }
+ };
+ class lsl : public extend {
+ public:
+ lsl(int shift = -1): extend(shift, 0b011, ext::uxtx) { }
+ };
+ class sxtw : public extend {
+ public:
+ sxtw(int shift = -1): extend(shift, 0b110, ext::sxtw) { }
+ };
+ class sxtx : public extend {
+ public:
+ sxtx(int shift = -1): extend(shift, 0b111, ext::sxtx) { }
+ };
+
+ private:
+ Register _base;
+ Register _index;
+ long _offset;
+ enum mode _mode;
+ extend _ext;
+
+ RelocationHolder _rspec;
+
+ // Typically when we use AddressLiterals we want to use their rval.
+ // However in some situations we want the lval (effective address) of
+ // the item. We provide a special factory for making those lvals.
+ bool _is_lval;
+
+ // If the target is far we'll need to load the ea of this to a
+ // register to reach it. Otherwise if near we can do PC-relative
+ // addressing.
+ address _target;
+
+ public:
+ Address()
+ : _mode(no_mode) { }
+ Address(Register r)
+ : _mode(base_plus_offset), _base(r), _offset(0), _index(noreg), _target(0) { }
+ Address(Register r, int o)
+ : _mode(base_plus_offset), _base(r), _offset(o), _index(noreg), _target(0) { }
+ Address(Register r, long o)
+ : _mode(base_plus_offset), _base(r), _offset(o), _index(noreg), _target(0) { }
+ Address(Register r, unsigned long o)
+ : _mode(base_plus_offset), _base(r), _offset(o), _index(noreg), _target(0) { }
+#ifdef ASSERT
+ Address(Register r, ByteSize disp)
+ : _mode(base_plus_offset), _base(r), _offset(in_bytes(disp)),
+ _index(noreg), _target(0) { }
+#endif
+ Address(Register r, Register r1, extend ext = lsl())
+ : _mode(base_plus_offset_reg), _base(r), _index(r1),
+ _ext(ext), _offset(0), _target(0) { }
+ Address(Pre p)
+ : _mode(pre), _base(p.reg()), _offset(p.offset()) { }
+ Address(Post p)
+ : _mode(post), _base(p.reg()), _offset(p.offset()), _target(0) { }
+ Address(address target, RelocationHolder const& rspec)
+ : _mode(literal),
+ _rspec(rspec),
+ _is_lval(false),
+ _target(target) { }
+ Address(address target, relocInfo::relocType rtype = relocInfo::external_word_type);
+ Address(Register base, RegisterOrConstant index, extend ext = lsl())
+ : _base (base),
+ _ext(ext), _offset(0), _target(0) {
+ if (index.is_register()) {
+ _mode = base_plus_offset_reg;
+ _index = index.as_register();
+ } else {
+ guarantee(ext.option() == ext::uxtx, "should be");
+ assert(index.is_constant(), "should be");
+ _mode = base_plus_offset;
+ _offset = index.as_constant() << ext.shift();
+ }
+ }
+
+ Register base() const {
+ guarantee((_mode == base_plus_offset || _mode == base_plus_offset_reg
+ || _mode == post),
+ "wrong mode");
+ return _base;
+ }
+ long offset() const {
+ return _offset;
+ }
+ Register index() const {
+ return _index;
+ }
+ mode getMode() const {
+ return _mode;
+ }
+ bool uses(Register reg) const { return _base == reg || _index == reg; }
+ address target() const { return _target; }
+ const RelocationHolder& rspec() const { return _rspec; }
+
+ void encode(Instruction_aarch64 *i) const {
+ i->f(0b111, 29, 27);
+ i->srf(_base, 5);
+
+ switch(_mode) {
+ case base_plus_offset:
+ {
+ unsigned size = i->get(31, 30);
+ unsigned mask = (1 << size) - 1;
+ if (_offset < 0 || _offset & mask)
+ {
+ i->f(0b00, 25, 24);
+ i->f(0, 21), i->f(0b00, 11, 10);
+ i->sf(_offset, 20, 12);
+ } else {
+ i->f(0b01, 25, 24);
+ i->f(_offset >> size, 21, 10);
+ }
+ }
+ break;
+
+ case base_plus_offset_reg:
+ {
+ i->f(0b00, 25, 24);
+ i->f(1, 21);
+ i->rf(_index, 16);
+ i->f(_ext.option(), 15, 13);
+ unsigned size = i->get(31, 30);
+ if (size == 0) // It's a byte
+ i->f(_ext.shift() >= 0, 12);
+ else {
+ if (_ext.shift() > 0)
+ assert(_ext.shift() == (int)size, "bad shift");
+ i->f(_ext.shift() > 0, 12);
+ }
+ i->f(0b10, 11, 10);
+ }
+ break;
+
+ case pre:
+ i->f(0b00, 25, 24);
+ i->f(0, 21), i->f(0b11, 11, 10);
+ i->sf(_offset, 20, 12);
+ break;
+
+ case post:
+ i->f(0b00, 25, 24);
+ i->f(0, 21), i->f(0b01, 11, 10);
+ i->sf(_offset, 20, 12);
+ break;
+
+ default:
+ ShouldNotReachHere();
+ }
+ }
+
+ void encode_pair(Instruction_aarch64 *i) const {
+ switch(_mode) {
+ case base_plus_offset:
+ i->f(0b010, 25, 23);
+ break;
+ case pre:
+ i->f(0b011, 25, 23);
+ break;
+ case post:
+ i->f(0b001, 25, 23);
+ break;
+ default:
+ ShouldNotReachHere();
+ }
+
+ unsigned size; // Operand shift in 32-bit words
+
+ if (i->get(26, 26)) { // float
+ switch(i->get(31, 30)) {
+ case 0b10:
+ size = 2; break;
+ case 0b01:
+ size = 1; break;
+ case 0b00:
+ size = 0; break;
+ default:
+ ShouldNotReachHere();
+ }
+ } else {
+ size = i->get(31, 31);
+ }
+
+ size = 4 << size;
+ guarantee(_offset % size == 0, "bad offset");
+ i->sf(_offset / size, 21, 15);
+ i->srf(_base, 5);
+ }
+
+ void encode_nontemporal_pair(Instruction_aarch64 *i) const {
+ // Only base + offset is allowed
+ i->f(0b000, 25, 23);
+ unsigned size = i->get(31, 31);
+ size = 4 << size;
+ guarantee(_offset % size == 0, "bad offset");
+ i->sf(_offset / size, 21, 15);
+ i->srf(_base, 5);
+ guarantee(_mode == Address::base_plus_offset,
+ "Bad addressing mode for non-temporal op");
+ }
+
+ void lea(MacroAssembler *, Register) const;
+
+ static bool offset_ok_for_immed(long offset, int shift = 0) {
+ unsigned mask = (1 << shift) - 1;
+ if (offset < 0 || offset & mask) {
+ return (uabs(offset) < (1 << (20 - 12))); // Unscaled offset
+ } else {
+ return ((offset >> shift) < (1 << (21 - 10 + 1))); // Scaled, unsigned offset
+ }
+ }
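+
+ // For example (sketch), with shift == 3 (an 8-byte ldr/str):
+ // offset 32760 is accepted (scaled: 32760 >> 3 == 4095 fits in 12 bits),
+ // offset -16 is accepted (unscaled: |-16| < 256 fits in simm9), and
+ // offset 32768 is rejected (32768 >> 3 == 4096 does not fit).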
+};
+
+// Convenience classes
+class RuntimeAddress: public Address {
+
+ public:
+
+ RuntimeAddress(address target) : Address(target, relocInfo::runtime_call_type) {}
+
+};
+
+class OopAddress: public Address {
+
+ public:
+
+ OopAddress(address target) : Address(target, relocInfo::oop_type){}
+
+};
+
+class ExternalAddress: public Address {
+ private:
+ static relocInfo::relocType reloc_for_target(address target) {
+ // Sometimes ExternalAddress is used for values which aren't
+ // exactly addresses, like the card table base.
+ // external_word_type can't be used for values in the first page
+ // so just skip the reloc in that case.
+ return external_word_Relocation::can_be_relocated(target) ? relocInfo::external_word_type : relocInfo::none;
+ }
+
+ public:
+
+ ExternalAddress(address target) : Address(target, reloc_for_target(target)) {}
+
+};
+
+class InternalAddress: public Address {
+
+ public:
+
+ InternalAddress(address target) : Address(target, relocInfo::internal_word_type) {}
+};
+
+const int FPUStateSizeInWords = 32 * 2;
+typedef enum {
+ PLDL1KEEP = 0b00000, PLDL1STRM, PLDL2KEEP, PLDL2STRM, PLDL3KEEP, PLDL3STRM,
+ PSTL1KEEP = 0b10000, PSTL1STRM, PSTL2KEEP, PSTL2STRM, PSTL3KEEP, PSTL3STRM,
+ PLIL1KEEP = 0b01000, PLIL1STRM, PLIL2KEEP, PLIL2STRM, PLIL3KEEP, PLIL3STRM
+} prfop;
+
+class Assembler : public AbstractAssembler {
+
+#ifndef PRODUCT
+ static const unsigned long asm_bp;
+
+ void emit_long(jint x) {
+ if ((unsigned long)pc() == asm_bp)
+ asm volatile ("nop");
+ AbstractAssembler::emit_int32(x);
+ }
+#else
+ void emit_long(jint x) {
+ AbstractAssembler::emit_int32(x);
+ }
+#endif
+
+public:
+
+ enum { instruction_size = 4 };
+
+ Address adjust(Register base, int offset, bool preIncrement) {
+ if (preIncrement)
+ return Address(Pre(base, offset));
+ else
+ return Address(Post(base, offset));
+ }
+
+ Address pre(Register base, int offset) {
+ return adjust(base, offset, true);
+ }
+
+ Address post (Register base, int offset) {
+ return adjust(base, offset, false);
+ }
+
+ Instruction_aarch64* current;
+
+ void set_current(Instruction_aarch64* i) { current = i; }
+
+ void f(unsigned val, int msb, int lsb) {
+ current->f(val, msb, lsb);
+ }
+ void f(unsigned val, int msb) {
+ current->f(val, msb, msb);
+ }
+ void sf(long val, int msb, int lsb) {
+ current->sf(val, msb, lsb);
+ }
+ void rf(Register reg, int lsb) {
+ current->rf(reg, lsb);
+ }
+ void srf(Register reg, int lsb) {
+ current->srf(reg, lsb);
+ }
+ void zrf(Register reg, int lsb) {
+ current->zrf(reg, lsb);
+ }
+ void rf(FloatRegister reg, int lsb) {
+ current->rf(reg, lsb);
+ }
+ void fixed(unsigned value, unsigned mask) {
+ current->fixed(value, mask);
+ }
+
+ void emit() {
+ emit_long(current->get_insn());
+ assert_cond(current->get_bits() == 0xffffffff);
+ current = NULL;
+ }
+
+ typedef void (Assembler::* uncond_branch_insn)(address dest);
+ typedef void (Assembler::* compare_and_branch_insn)(Register Rt, address dest);
+ typedef void (Assembler::* test_and_branch_insn)(Register Rt, int bitpos, address dest);
+ typedef void (Assembler::* prefetch_insn)(address target, prfop);
+
+ void wrap_label(Label &L, uncond_branch_insn insn);
+ void wrap_label(Register r, Label &L, compare_and_branch_insn insn);
+ void wrap_label(Register r, int bitpos, Label &L, test_and_branch_insn insn);
+ void wrap_label(Label &L, prfop, prefetch_insn insn);
+
+ // PC-rel. addressing
+
+ void adr(Register Rd, address dest);
+ void _adrp(Register Rd, address dest);
+
+ void adr(Register Rd, const Address &dest);
+ void _adrp(Register Rd, const Address &dest);
+
+ void adr(Register Rd, Label &L) {
+ wrap_label(Rd, L, &Assembler::adr);
+ }
+ void _adrp(Register Rd, Label &L) {
+ wrap_label(Rd, L, &Assembler::_adrp);
+ }
+
+ void adrp(Register Rd, const Address &dest, unsigned long &offset);
+
+#undef INSN
+
+ void add_sub_immediate(Register Rd, Register Rn, unsigned uimm, int op,
+ int negated_op);
+
+ // Add/subtract (immediate)
+#define INSN(NAME, decode, negated) \
+ void NAME(Register Rd, Register Rn, unsigned imm, unsigned shift) { \
+ starti; \
+ f(decode, 31, 29), f(0b10001, 28, 24), f(shift, 23, 22), f(imm, 21, 10); \
+ zrf(Rd, 0), srf(Rn, 5); \
+ } \
+ \
+ void NAME(Register Rd, Register Rn, unsigned imm) { \
+ starti; \
+ add_sub_immediate(Rd, Rn, imm, decode, negated); \
+ }
+
+ INSN(addsw, 0b001, 0b011);
+ INSN(subsw, 0b011, 0b001);
+ INSN(adds, 0b101, 0b111);
+ INSN(subs, 0b111, 0b101);
+
+#undef INSN
+
+#define INSN(NAME, decode, negated) \
+ void NAME(Register Rd, Register Rn, unsigned imm) { \
+ starti; \
+ add_sub_immediate(Rd, Rn, imm, decode, negated); \
+ }
+
+ INSN(addw, 0b000, 0b010);
+ INSN(subw, 0b010, 0b000);
+ INSN(add, 0b100, 0b110);
+ INSN(sub, 0b110, 0b100);
+
+#undef INSN
+
+ // Logical (immediate)
+#define INSN(NAME, decode, is32) \
+ void NAME(Register Rd, Register Rn, uint64_t imm) { \
+ starti; \
+ uint32_t val = encode_logical_immediate(is32, imm); \
+ f(decode, 31, 29), f(0b100100, 28, 23), f(val, 22, 10); \
+ srf(Rd, 0), zrf(Rn, 5); \
+ }
+
+ INSN(andw, 0b000, true);
+ INSN(orrw, 0b001, true);
+ INSN(eorw, 0b010, true);
+ INSN(andr, 0b100, false);
+ INSN(orr, 0b101, false);
+ INSN(eor, 0b110, false);
+
+#undef INSN
+
+#define INSN(NAME, decode, is32) \
+ void NAME(Register Rd, Register Rn, uint64_t imm) { \
+ starti; \
+ uint32_t val = encode_logical_immediate(is32, imm); \
+ f(decode, 31, 29), f(0b100100, 28, 23), f(val, 22, 10); \
+ zrf(Rd, 0), zrf(Rn, 5); \
+ }
+
+ INSN(ands, 0b111, false);
+ INSN(andsw, 0b011, true);
+
+#undef INSN
+
+ // Move wide (immediate)
+#define INSN(NAME, opcode) \
+ void NAME(Register Rd, unsigned imm, unsigned shift = 0) { \
+ assert_cond((shift/16)*16 == shift); \
+ starti; \
+ f(opcode, 31, 29), f(0b100101, 28, 23), f(shift/16, 22, 21), \
+ f(imm, 20, 5); \
+ rf(Rd, 0); \
+ }
+
+ INSN(movnw, 0b000);
+ INSN(movzw, 0b010);
+ INSN(movkw, 0b011);
+ INSN(movn, 0b100);
+ INSN(movz, 0b110);
+ INSN(movk, 0b111);
+
+#undef INSN
+
+ // Bitfield
+#define INSN(NAME, opcode) \
+ void NAME(Register Rd, Register Rn, unsigned immr, unsigned imms) { \
+ starti; \
+ f(opcode, 31, 22), f(immr, 21, 16), f(imms, 15, 10); \
+ rf(Rn, 5), rf(Rd, 0); \
+ }
+
+ INSN(sbfmw, 0b0001001100);
+ INSN(bfmw, 0b0011001100);
+ INSN(ubfmw, 0b0101001100);
+ INSN(sbfm, 0b1001001101);
+ INSN(bfm, 0b1011001101);
+ INSN(ubfm, 0b1101001101);
+
+#undef INSN
+
+ // Extract
+#define INSN(NAME, opcode) \
+ void NAME(Register Rd, Register Rn, Register Rm, unsigned imms) { \
+ starti; \
+ f(opcode, 31, 21), f(imms, 15, 10); \
+ rf(Rm, 16), rf(Rn, 5), rf(Rd, 0); \
+ }
+
+ INSN(extrw, 0b00010011100);
+ INSN(extr, 0b10010011110);
+
+#undef INSN
+
+ // The maximum range of a branch is fixed for the AArch64
+ // architecture. In debug mode we shrink it in order to test
+ // trampolines, but not so small that branches in the interpreter
+ // are out of range.
+ static const unsigned long branch_range = NOT_DEBUG(128 * M) DEBUG_ONLY(2 * M);
+
+ static bool reachable_from_branch_at(address branch, address target) {
+ return uabs(target - branch) < branch_range;
+ }
+
+ // Unconditional branch (immediate)
+#define INSN(NAME, opcode) \
+ void NAME(address dest) { \
+ starti; \
+ long offset = (dest - pc()) >> 2; \
+ DEBUG_ONLY(assert(reachable_from_branch_at(pc(), dest), "debug only")); \
+ f(opcode, 31), f(0b00101, 30, 26), sf(offset, 25, 0); \
+ } \
+ void NAME(Label &L) { \
+ wrap_label(L, &Assembler::NAME); \
+ } \
+ void NAME(const Address &dest);
+
+ INSN(b, 0);
+ INSN(bl, 1);
+
+#undef INSN
+
+ // Compare & branch (immediate)
+#define INSN(NAME, opcode) \
+ void NAME(Register Rt, address dest) { \
+ long offset = (dest - pc()) >> 2; \
+ starti; \
+ f(opcode, 31, 24), sf(offset, 23, 5), rf(Rt, 0); \
+ } \
+ void NAME(Register Rt, Label &L) { \
+ wrap_label(Rt, L, &Assembler::NAME); \
+ }
+
+ INSN(cbzw, 0b00110100);
+ INSN(cbnzw, 0b00110101);
+ INSN(cbz, 0b10110100);
+ INSN(cbnz, 0b10110101);
+
+#undef INSN
+
+ // Test & branch (immediate)
+#define INSN(NAME, opcode) \
+ void NAME(Register Rt, int bitpos, address dest) { \
+ long offset = (dest - pc()) >> 2; \
+ int b5 = bitpos >> 5; \
+ bitpos &= 0x1f; \
+ starti; \
+ f(b5, 31), f(opcode, 30, 24), f(bitpos, 23, 19), sf(offset, 18, 5); \
+ rf(Rt, 0); \
+ } \
+ void NAME(Register Rt, int bitpos, Label &L) { \
+ wrap_label(Rt, bitpos, L, &Assembler::NAME); \
+ }
+
+ INSN(tbz, 0b0110110);
+ INSN(tbnz, 0b0110111);
+
+#undef INSN
+
+ // Conditional branch (immediate)
+ enum Condition
+ {EQ, NE, HS, CS=HS, LO, CC=LO, MI, PL, VS, VC, HI, LS, GE, LT, GT, LE, AL, NV};
+
+ void br(Condition cond, address dest) {
+ long offset = (dest - pc()) >> 2;
+ starti;
+ f(0b0101010, 31, 25), f(0, 24), sf(offset, 23, 5), f(0, 4), f(cond, 3, 0);
+ }
+
+#define INSN(NAME, cond) \
+ void NAME(address dest) { \
+ br(cond, dest); \
+ }
+
+ INSN(beq, EQ);
+ INSN(bne, NE);
+ INSN(bhs, HS);
+ INSN(bcs, CS);
+ INSN(blo, LO);
+ INSN(bcc, CC);
+ INSN(bmi, MI);
+ INSN(bpl, PL);
+ INSN(bvs, VS);
+ INSN(bvc, VC);
+ INSN(bhi, HI);
+ INSN(bls, LS);
+ INSN(bge, GE);
+ INSN(blt, LT);
+ INSN(bgt, GT);
+ INSN(ble, LE);
+ INSN(bal, AL);
+ INSN(bnv, NV);
+
+ void br(Condition cc, Label &L);
+
+#undef INSN
+
+ // Exception generation
+ void generate_exception(int opc, int op2, int LL, unsigned imm) {
+ starti;
+ f(0b11010100, 31, 24);
+ f(opc, 23, 21), f(imm, 20, 5), f(op2, 4, 2), f(LL, 1, 0);
+ }
+
+#define INSN(NAME, opc, op2, LL) \
+ void NAME(unsigned imm) { \
+ generate_exception(opc, op2, LL, imm); \
+ }
+
+ INSN(svc, 0b000, 0, 0b01);
+ INSN(hvc, 0b000, 0, 0b10);
+ INSN(smc, 0b000, 0, 0b11);
+ INSN(brk, 0b001, 0, 0b00);
+ INSN(hlt, 0b010, 0, 0b00);
+ INSN(dpcs1, 0b101, 0, 0b01);
+ INSN(dpcs2, 0b101, 0, 0b10);
+ INSN(dpcs3, 0b101, 0, 0b11);
+
+#undef INSN
+
+ // System
+ void system(int op0, int op1, int CRn, int CRm, int op2,
+ Register rt = (Register)0b11111)
+ {
+ starti;
+ f(0b11010101000, 31, 21);
+ f(op0, 20, 19);
+ f(op1, 18, 16);
+ f(CRn, 15, 12);
+ f(CRm, 11, 8);
+ f(op2, 7, 5);
+ rf(rt, 0);
+ }
+
+ void hint(int imm) {
+ system(0b00, 0b011, 0b0010, imm, 0b000);
+ }
+
+ void nop() {
+ hint(0);
+ }
+ // we only provide mrs and msr for the special purpose system
+ // registers where op1 (instr[20:19]) == 11 and (currently) only
+ // use them for FPSR; n.b. msr has L (instr[21]) == 0, mrs has L == 1
+
+ void msr(int op1, int CRn, int CRm, int op2, Register rt) {
+ starti;
+ f(0b1101010100011, 31, 19);
+ f(op1, 18, 16);
+ f(CRn, 15, 12);
+ f(CRm, 11, 8);
+ f(op2, 7, 5);
+ // writing zr is ok
+ zrf(rt, 0);
+ }
+
+ void mrs(int op1, int CRn, int CRm, int op2, Register rt) {
+ starti;
+ f(0b1101010100111, 31, 19);
+ f(op1, 18, 16);
+ f(CRn, 15, 12);
+ f(CRm, 11, 8);
+ f(op2, 7, 5);
+ // reading to zr is a mistake
+ rf(rt, 0);
+ }
+
+ enum barrier {OSHLD = 0b0001, OSHST, OSH, NSHLD=0b0101, NSHST, NSH,
+ ISHLD = 0b1001, ISHST, ISH, LD=0b1101, ST, SY};
+
+ void dsb(barrier imm) {
+ system(0b00, 0b011, 0b00011, imm, 0b100);
+ }
+
+ void dmb(barrier imm) {
+ system(0b00, 0b011, 0b00011, imm, 0b101);
+ }
+
+ void isb() {
+ system(0b00, 0b011, 0b00011, SY, 0b110);
+ }
+
+ void dc(Register Rt) {
+ system(0b01, 0b011, 0b0111, 0b1011, 0b001, Rt);
+ }
+
+ void ic(Register Rt) {
+ system(0b01, 0b011, 0b0111, 0b0101, 0b001, Rt);
+ }
+
+ // A more convenient access to dmb for our purposes
+ enum Membar_mask_bits {
+ // We can use ISH for a barrier because the ARM ARM says "This
+ // architecture assumes that all Processing Elements that use the
+ // same operating system or hypervisor are in the same Inner
+ // Shareable shareability domain."
+ StoreStore = ISHST,
+ LoadStore = ISHLD,
+ LoadLoad = ISHLD,
+ StoreLoad = ISH,
+ AnyAny = ISH
+ };
+
+ void membar(Membar_mask_bits order_constraint) {
+ dmb(Assembler::barrier(order_constraint));
+ }
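+
+ // For example (sketch): membar(StoreLoad) and membar(AnyAny) both emit
+ // "dmb ish", while membar(StoreStore) emits the store-only barrier
+ // "dmb ishst".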
+
+ // Unconditional branch (register)
+ void branch_reg(Register R, int opc) {
+ starti;
+ f(0b1101011, 31, 25);
+ f(opc, 24, 21);
+ f(0b11111000000, 20, 10);
+ rf(R, 5);
+ f(0b00000, 4, 0);
+ }
+
+#define INSN(NAME, opc) \
+ void NAME(Register R) { \
+ branch_reg(R, opc); \
+ }
+
+ INSN(br, 0b0000);
+ INSN(blr, 0b0001);
+ INSN(ret, 0b0010);
+
+ void ret(void *p); // This forces a compile-time error for ret(0)
+
+#undef INSN
+
+#define INSN(NAME, opc) \
+ void NAME() { \
+ branch_reg((Register)0b11111, opc); \
+ }
+
+ INSN(eret, 0b0100);
+ INSN(drps, 0b0101);
+
+#undef INSN
+
+ // Load/store exclusive
+ enum operand_size { byte, halfword, word, xword };
+
+ void load_store_exclusive(Register Rs, Register Rt1, Register Rt2,
+ Register Rn, enum operand_size sz, int op, int o0) {
+ starti;
+ f(sz, 31, 30), f(0b001000, 29, 24), f(op, 23, 21);
+ rf(Rs, 16), f(o0, 15), rf(Rt2, 10), rf(Rn, 5), rf(Rt1, 0);
+ }
+
+#define INSN4(NAME, sz, op, o0) /* Four registers */ \
+ void NAME(Register Rs, Register Rt1, Register Rt2, Register Rn) { \
+ load_store_exclusive(Rs, Rt1, Rt2, Rn, sz, op, o0); \
+ }
+
+#define INSN3(NAME, sz, op, o0) /* Three registers */ \
+ void NAME(Register Rs, Register Rt, Register Rn) { \
+ load_store_exclusive(Rs, Rt, (Register)0b11111, Rn, sz, op, o0); \
+ }
+
+#define INSN2(NAME, sz, op, o0) /* Two registers */ \
+ void NAME(Register Rt, Register Rn) { \
+ load_store_exclusive((Register)0b11111, Rt, (Register)0b11111, \
+ Rn, sz, op, o0); \
+ }
+
+#define INSN_FOO(NAME, sz, op, o0) /* Three registers, encoded differently */ \
+ void NAME(Register Rt1, Register Rt2, Register Rn) { \
+ load_store_exclusive((Register)0b11111, Rt1, Rt2, Rn, sz, op, o0); \
+ }
+
+ // bytes
+ INSN3(stxrb, byte, 0b000, 0);
+ INSN3(stlxrb, byte, 0b000, 1);
+ INSN2(ldxrb, byte, 0b010, 0);
+ INSN2(ldaxrb, byte, 0b010, 1);
+ INSN2(stlrb, byte, 0b100, 1);
+ INSN2(ldarb, byte, 0b110, 1);
+
+ // halfwords
+ INSN3(stxrh, halfword, 0b000, 0);
+ INSN3(stlxrh, halfword, 0b000, 1);
+ INSN2(ldxrh, halfword, 0b010, 0);
+ INSN2(ldaxrh, halfword, 0b010, 1);
+ INSN2(stlrh, halfword, 0b100, 1);
+ INSN2(ldarh, halfword, 0b110, 1);
+
+ // words
+ INSN3(stxrw, word, 0b000, 0);
+ INSN3(stlxrw, word, 0b000, 1);
+ INSN4(stxpw, word, 0b001, 0);
+ INSN4(stlxpw, word, 0b001, 1);
+ INSN2(ldxrw, word, 0b010, 0);
+ INSN2(ldaxrw, word, 0b010, 1);
+ INSN_FOO(ldxpw, word, 0b011, 0);
+ INSN_FOO(ldaxpw, word, 0b011, 1);
+ INSN2(stlrw, word, 0b100, 1);
+ INSN2(ldarw, word, 0b110, 1);
+
+ // xwords
+ INSN3(stxr, xword, 0b000, 0);
+ INSN3(stlxr, xword, 0b000, 1);
+ INSN4(stxp, xword, 0b001, 0);
+ INSN4(stlxp, xword, 0b001, 1);
+ INSN2(ldxr, xword, 0b010, 0);
+ INSN2(ldaxr, xword, 0b010, 1);
+ INSN_FOO(ldxp, xword, 0b011, 0);
+ INSN_FOO(ldaxp, xword, 0b011, 1);
+ INSN2(stlr, xword, 0b100, 1);
+ INSN2(ldar, xword, 0b110, 1);
+
+#undef INSN2
+#undef INSN3
+#undef INSN4
+#undef INSN_FOO
+
+ // Load register (literal)
+#define INSN(NAME, opc, V) \
+ void NAME(Register Rt, address dest) { \
+ long offset = (dest - pc()) >> 2; \
+ starti; \
+ f(opc, 31, 30), f(0b011, 29, 27), f(V, 26), f(0b00, 25, 24), \
+ sf(offset, 23, 5); \
+ rf(Rt, 0); \
+ } \
+ void NAME(Register Rt, address dest, relocInfo::relocType rtype) { \
+ InstructionMark im(this); \
+ guarantee(rtype == relocInfo::internal_word_type, \
+ "only internal_word_type relocs make sense here"); \
+ code_section()->relocate(inst_mark(), InternalAddress(dest).rspec()); \
+ NAME(Rt, dest); \
+ } \
+ void NAME(Register Rt, Label &L) { \
+ wrap_label(Rt, L, &Assembler::NAME); \
+ }
+
+ INSN(ldrw, 0b00, 0);
+ INSN(ldr, 0b01, 0);
+ INSN(ldrsw, 0b10, 0);
+
+#undef INSN
+
+#define INSN(NAME, opc, V) \
+ void NAME(FloatRegister Rt, address dest) { \
+ long offset = (dest - pc()) >> 2; \
+ starti; \
+ f(opc, 31, 30), f(0b011, 29, 27), f(V, 26), f(0b00, 25, 24), \
+ sf(offset, 23, 5); \
+ rf((Register)Rt, 0); \
+ }
+
+ INSN(ldrs, 0b00, 1);
+ INSN(ldrd, 0b01, 1);
+ INSN(ldrq, 0b10, 1);
+
+#undef INSN
+
+#define INSN(NAME, opc, V) \
+ void NAME(address dest, prfop op = PLDL1KEEP) { \
+ long offset = (dest - pc()) >> 2; \
+ starti; \
+ f(opc, 31, 30), f(0b011, 29, 27), f(V, 26), f(0b00, 25, 24), \
+ sf(offset, 23, 5); \
+ f(op, 4, 0); \
+ } \
+ void NAME(Label &L, prfop op = PLDL1KEEP) { \
+ wrap_label(L, op, &Assembler::NAME); \
+ }
+
+ INSN(prfm, 0b11, 0);
+
+#undef INSN
+
+ // Load/store
+ void ld_st1(int opc, int p1, int V, int L,
+ Register Rt1, Register Rt2, Address adr, bool no_allocate) {
+ starti;
+ f(opc, 31, 30), f(p1, 29, 27), f(V, 26), f(L, 22);
+ zrf(Rt2, 10), zrf(Rt1, 0);
+ if (no_allocate) {
+ adr.encode_nontemporal_pair(current);
+ } else {
+ adr.encode_pair(current);
+ }
+ }
+
+ // Load/store register pair (offset)
+#define INSN(NAME, size, p1, V, L, no_allocate) \
+ void NAME(Register Rt1, Register Rt2, Address adr) { \
+ ld_st1(size, p1, V, L, Rt1, Rt2, adr, no_allocate); \
+ }
+
+ INSN(stpw, 0b00, 0b101, 0, 0, false);
+ INSN(ldpw, 0b00, 0b101, 0, 1, false);
+ INSN(ldpsw, 0b01, 0b101, 0, 1, false);
+ INSN(stp, 0b10, 0b101, 0, 0, false);
+ INSN(ldp, 0b10, 0b101, 0, 1, false);
+
+ // Load/store no-allocate pair (offset)
+ INSN(stnpw, 0b00, 0b101, 0, 0, true);
+ INSN(ldnpw, 0b00, 0b101, 0, 1, true);
+ INSN(stnp, 0b10, 0b101, 0, 0, true);
+ INSN(ldnp, 0b10, 0b101, 0, 1, true);
+
+#undef INSN
+
+#define INSN(NAME, size, p1, V, L, no_allocate) \
+ void NAME(FloatRegister Rt1, FloatRegister Rt2, Address adr) { \
+ ld_st1(size, p1, V, L, (Register)Rt1, (Register)Rt2, adr, no_allocate); \
+ }
+
+ INSN(stps, 0b00, 0b101, 1, 0, false);
+ INSN(ldps, 0b00, 0b101, 1, 1, false);
+ INSN(stpd, 0b01, 0b101, 1, 0, false);
+ INSN(ldpd, 0b01, 0b101, 1, 1, false);
+ INSN(stpq, 0b10, 0b101, 1, 0, false);
+ INSN(ldpq, 0b10, 0b101, 1, 1, false);
+
+#undef INSN
+
+ // Load/store register (all modes)
+ void ld_st2(Register Rt, const Address &adr, int size, int op, int V = 0) {
+ starti;
+
+ f(V, 26); // general reg?
+ zrf(Rt, 0);
+
+ // Encoding for literal loads is done here (rather than pushed
+ // down into Address::encode) because the encoding of this
+ // instruction is too different from all of the other forms to
+ // make it worth sharing.
+ if (adr.getMode() == Address::literal) {
+ assert(size == 0b10 || size == 0b11, "bad operand size in ldr");
+ assert(op == 0b01, "literal form can only be used with loads");
+ f(size & 0b01, 31, 30), f(0b011, 29, 27), f(0b00, 25, 24);
+ long offset = (adr.target() - pc()) >> 2;
+ sf(offset, 23, 5);
+ code_section()->relocate(pc(), adr.rspec());
+ return;
+ }
+
+ f(size, 31, 30);
+ f(op, 23, 22); // str
+ adr.encode(current);
+ }
+
+#define INSN(NAME, size, op) \
+ void NAME(Register Rt, const Address &adr) { \
+ ld_st2(Rt, adr, size, op); \
+ } \
+
+ INSN(str, 0b11, 0b00);
+ INSN(strw, 0b10, 0b00);
+ INSN(strb, 0b00, 0b00);
+ INSN(strh, 0b01, 0b00);
+
+ INSN(ldr, 0b11, 0b01);
+ INSN(ldrw, 0b10, 0b01);
+ INSN(ldrb, 0b00, 0b01);
+ INSN(ldrh, 0b01, 0b01);
+
+ INSN(ldrsb, 0b00, 0b10);
+ INSN(ldrsbw, 0b00, 0b11);
+ INSN(ldrsh, 0b01, 0b10);
+ INSN(ldrshw, 0b01, 0b11);
+ INSN(ldrsw, 0b10, 0b10);
+
+#undef INSN
+
+#define INSN(NAME, size, op) \
+ void NAME(const Address &adr, prfop pfop = PLDL1KEEP) { \
+ ld_st2((Register)pfop, adr, size, op); \
+ }
+
+ INSN(prfm, 0b11, 0b10); // FIXME: PRFM should not be used with
+ // writeback modes, but the assembler
+ // doesn't enforce that.
+
+#undef INSN
+
+#define INSN(NAME, size, op) \
+ void NAME(FloatRegister Rt, const Address &adr) { \
+ ld_st2((Register)Rt, adr, size, op, 1); \
+ }
+
+ INSN(strd, 0b11, 0b00);
+ INSN(strs, 0b10, 0b00);
+ INSN(ldrd, 0b11, 0b01);
+ INSN(ldrs, 0b10, 0b01);
+ INSN(strq, 0b00, 0b10);
+ INSN(ldrq, 0b00, 0b11);
+
+#undef INSN
+
+ enum shift_kind { LSL, LSR, ASR, ROR };
+
+ void op_shifted_reg(unsigned decode,
+ enum shift_kind kind, unsigned shift,
+ unsigned size, unsigned op) {
+ f(size, 31);
+ f(op, 30, 29);
+ f(decode, 28, 24);
+ f(shift, 15, 10);
+ f(kind, 23, 22);
+ }
+
+ // Logical (shifted register)
+#define INSN(NAME, size, op, N) \
+ void NAME(Register Rd, Register Rn, Register Rm, \
+ enum shift_kind kind = LSL, unsigned shift = 0) { \
+ starti; \
+ f(N, 21); \
+ zrf(Rm, 16), zrf(Rn, 5), zrf(Rd, 0); \
+ op_shifted_reg(0b01010, kind, shift, size, op); \
+ }
+
+ INSN(andr, 1, 0b00, 0);
+ INSN(orr, 1, 0b01, 0);
+ INSN(eor, 1, 0b10, 0);
+ INSN(ands, 1, 0b11, 0);
+ INSN(andw, 0, 0b00, 0);
+ INSN(orrw, 0, 0b01, 0);
+ INSN(eorw, 0, 0b10, 0);
+ INSN(andsw, 0, 0b11, 0);
+
+ INSN(bic, 1, 0b00, 1);
+ INSN(orn, 1, 0b01, 1);
+ INSN(eon, 1, 0b10, 1);
+ INSN(bics, 1, 0b11, 1);
+ INSN(bicw, 0, 0b00, 1);
+ INSN(ornw, 0, 0b01, 1);
+ INSN(eonw, 0, 0b10, 1);
+ INSN(bicsw, 0, 0b11, 1);
+
+#undef INSN
+
+ // Add/subtract (shifted register)
+#define INSN(NAME, size, op) \
+ void NAME(Register Rd, Register Rn, Register Rm, \
+ enum shift_kind kind, unsigned shift = 0) { \
+ starti; \
+ f(0, 21); \
+ assert_cond(kind != ROR); \
+ zrf(Rd, 0), zrf(Rn, 5), zrf(Rm, 16); \
+ op_shifted_reg(0b01011, kind, shift, size, op); \
+ }
+
+ INSN(add, 1, 0b000);
+ INSN(sub, 1, 0b10);
+ INSN(addw, 0, 0b000);
+ INSN(subw, 0, 0b10);
+
+ INSN(adds, 1, 0b001);
+ INSN(subs, 1, 0b11);
+ INSN(addsw, 0, 0b001);
+ INSN(subsw, 0, 0b11);
+
+#undef INSN
+
+ // Add/subtract (extended register)
+#define INSN(NAME, op) \
+ void NAME(Register Rd, Register Rn, Register Rm, \
+ ext::operation option, int amount = 0) { \
+ starti; \
+ zrf(Rm, 16), srf(Rn, 5), srf(Rd, 0); \
+ add_sub_extended_reg(op, 0b01011, Rd, Rn, Rm, 0b00, option, amount); \
+ }
+
+ void add_sub_extended_reg(unsigned op, unsigned decode,
+ Register Rd, Register Rn, Register Rm,
+ unsigned opt, ext::operation option, unsigned imm) {
+ guarantee(imm <= 4, "shift amount must be <= 4");
+ f(op, 31, 29), f(decode, 28, 24), f(opt, 23, 22), f(1, 21);
+ f(option, 15, 13), f(imm, 12, 10);
+ }
+
+ INSN(addw, 0b000);
+ INSN(subw, 0b010);
+ INSN(add, 0b100);
+ INSN(sub, 0b110);
+
+#undef INSN
+
+#define INSN(NAME, op) \
+ void NAME(Register Rd, Register Rn, Register Rm, \
+ ext::operation option, int amount = 0) { \
+ starti; \
+ zrf(Rm, 16), srf(Rn, 5), zrf(Rd, 0); \
+ add_sub_extended_reg(op, 0b01011, Rd, Rn, Rm, 0b00, option, amount); \
+ }
+
+ INSN(addsw, 0b001);
+ INSN(subsw, 0b011);
+ INSN(adds, 0b101);
+ INSN(subs, 0b111);
+
+#undef INSN
+
+ // Aliases for short forms of add and sub
+#define INSN(NAME) \
+ void NAME(Register Rd, Register Rn, Register Rm) { \
+ if (Rd == sp || Rn == sp) \
+ NAME(Rd, Rn, Rm, ext::uxtx); \
+ else \
+ NAME(Rd, Rn, Rm, LSL); \
+ }
+
+ INSN(addw);
+ INSN(subw);
+ INSN(add);
+ INSN(sub);
+
+ INSN(addsw);
+ INSN(subsw);
+ INSN(adds);
+ INSN(subs);
+
+#undef INSN
+
+ // Add/subtract (with carry)
+ void add_sub_carry(unsigned op, Register Rd, Register Rn, Register Rm) {
+ starti;
+ f(op, 31, 29);
+ f(0b11010000, 28, 21);
+ f(0b000000, 15, 10);
+ rf(Rm, 16), rf(Rn, 5), rf(Rd, 0);
+ }
+
+ #define INSN(NAME, op) \
+ void NAME(Register Rd, Register Rn, Register Rm) { \
+ add_sub_carry(op, Rd, Rn, Rm); \
+ }
+
+ INSN(adcw, 0b000);
+ INSN(adcsw, 0b001);
+ INSN(sbcw, 0b010);
+ INSN(sbcsw, 0b011);
+ INSN(adc, 0b100);
+ INSN(adcs, 0b101);
+ INSN(sbc,0b110);
+ INSN(sbcs, 0b111);
+
+#undef INSN
+
+ // Conditional compare (both kinds)
+ void conditional_compare(unsigned op, int o2, int o3,
+ Register Rn, unsigned imm5, unsigned nzcv,
+ unsigned cond) {
+ f(op, 31, 29);
+ f(0b11010010, 28, 21);
+ f(cond, 15, 12);
+ f(o2, 10);
+ f(o3, 4);
+ f(nzcv, 3, 0);
+ f(imm5, 20, 16), rf(Rn, 5);
+ }
+
+#define INSN(NAME, op) \
+ void NAME(Register Rn, Register Rm, int imm, Condition cond) { \
+ starti; \
+ f(0, 11); \
+ conditional_compare(op, 0, 0, Rn, (uintptr_t)Rm, imm, cond); \
+ } \
+ \
+ void NAME(Register Rn, int imm5, int imm, Condition cond) { \
+ starti; \
+ f(1, 11); \
+ conditional_compare(op, 0, 0, Rn, imm5, imm, cond); \
+ }
+
+ INSN(ccmnw, 0b001);
+ INSN(ccmpw, 0b011);
+ INSN(ccmn, 0b101);
+ INSN(ccmp, 0b111);
+
+#undef INSN
+
+ // Conditional select
+ void conditional_select(unsigned op, unsigned op2,
+ Register Rd, Register Rn, Register Rm,
+ unsigned cond) {
+ starti;
+ f(op, 31, 29);
+ f(0b11010100, 28, 21);
+ f(cond, 15, 12);
+ f(op2, 11, 10);
+ zrf(Rm, 16), zrf(Rn, 5), rf(Rd, 0);
+ }
+
+#define INSN(NAME, op, op2) \
+ void NAME(Register Rd, Register Rn, Register Rm, Condition cond) { \
+ conditional_select(op, op2, Rd, Rn, Rm, cond); \
+ }
+
+ INSN(cselw, 0b000, 0b00);
+ INSN(csincw, 0b000, 0b01);
+ INSN(csinvw, 0b010, 0b00);
+ INSN(csnegw, 0b010, 0b01);
+ INSN(csel, 0b100, 0b00);
+ INSN(csinc, 0b100, 0b01);
+ INSN(csinv, 0b110, 0b00);
+ INSN(csneg, 0b110, 0b01);
+
+#undef INSN
+
+ // Data processing
+ void data_processing(unsigned op29, unsigned opcode,
+ Register Rd, Register Rn) {
+ f(op29, 31, 29), f(0b11010110, 28, 21);
+ f(opcode, 15, 10);
+ rf(Rn, 5), rf(Rd, 0);
+ }
+
+ // (1 source)
+#define INSN(NAME, op29, opcode2, opcode) \
+ void NAME(Register Rd, Register Rn) { \
+ starti; \
+ f(opcode2, 20, 16); \
+ data_processing(op29, opcode, Rd, Rn); \
+ }
+
+ INSN(rbitw, 0b010, 0b00000, 0b00000);
+ INSN(rev16w, 0b010, 0b00000, 0b00001);
+ INSN(revw, 0b010, 0b00000, 0b00010);
+ INSN(clzw, 0b010, 0b00000, 0b00100);
+ INSN(clsw, 0b010, 0b00000, 0b00101);
+
+ INSN(rbit, 0b110, 0b00000, 0b00000);
+ INSN(rev16, 0b110, 0b00000, 0b00001);
+ INSN(rev32, 0b110, 0b00000, 0b00010);
+ INSN(rev, 0b110, 0b00000, 0b00011);
+ INSN(clz, 0b110, 0b00000, 0b00100);
+ INSN(cls, 0b110, 0b00000, 0b00101);
+
+#undef INSN
+
+ // (2 sources)
+#define INSN(NAME, op29, opcode) \
+ void NAME(Register Rd, Register Rn, Register Rm) { \
+ starti; \
+ rf(Rm, 16); \
+ data_processing(op29, opcode, Rd, Rn); \
+ }
+
+ INSN(udivw, 0b000, 0b000010);
+ INSN(sdivw, 0b000, 0b000011);
+ INSN(lslvw, 0b000, 0b001000);
+ INSN(lsrvw, 0b000, 0b001001);
+ INSN(asrvw, 0b000, 0b001010);
+ INSN(rorvw, 0b000, 0b001011);
+
+ INSN(udiv, 0b100, 0b000010);
+ INSN(sdiv, 0b100, 0b000011);
+ INSN(lslv, 0b100, 0b001000);
+ INSN(lsrv, 0b100, 0b001001);
+ INSN(asrv, 0b100, 0b001010);
+ INSN(rorv, 0b100, 0b001011);
+
+#undef INSN
+
+ // (3 sources)
+ void data_processing(unsigned op54, unsigned op31, unsigned o0,
+ Register Rd, Register Rn, Register Rm,
+ Register Ra) {
+ starti;
+ f(op54, 31, 29), f(0b11011, 28, 24);
+ f(op31, 23, 21), f(o0, 15);
+ zrf(Rm, 16), zrf(Ra, 10), zrf(Rn, 5), zrf(Rd, 0);
+ }
+
+#define INSN(NAME, op54, op31, o0) \
+ void NAME(Register Rd, Register Rn, Register Rm, Register Ra) { \
+ data_processing(op54, op31, o0, Rd, Rn, Rm, Ra); \
+ }
+
+ INSN(maddw, 0b000, 0b000, 0);
+ INSN(msubw, 0b000, 0b000, 1);
+ INSN(madd, 0b100, 0b000, 0);
+ INSN(msub, 0b100, 0b000, 1);
+ INSN(smaddl, 0b100, 0b001, 0);
+ INSN(smsubl, 0b100, 0b001, 1);
+ INSN(umaddl, 0b100, 0b101, 0);
+ INSN(umsubl, 0b100, 0b101, 1);
+
+#undef INSN
+
+#define INSN(NAME, op54, op31, o0) \
+ void NAME(Register Rd, Register Rn, Register Rm) { \
+ data_processing(op54, op31, o0, Rd, Rn, Rm, (Register)31); \
+ }
+
+ INSN(smulh, 0b100, 0b010, 0);
+ INSN(umulh, 0b100, 0b110, 0);
+
+#undef INSN
+
+ // Floating-point data-processing (1 source)
+ void data_processing(unsigned op31, unsigned type, unsigned opcode,
+ FloatRegister Vd, FloatRegister Vn) {
+ starti;
+ f(op31, 31, 29);
+ f(0b11110, 28, 24);
+ f(type, 23, 22), f(1, 21), f(opcode, 20, 15), f(0b10000, 14, 10);
+ rf(Vn, 5), rf(Vd, 0);
+ }
+
+#define INSN(NAME, op31, type, opcode) \
+ void NAME(FloatRegister Vd, FloatRegister Vn) { \
+ data_processing(op31, type, opcode, Vd, Vn); \
+ }
+
+private:
+ INSN(i_fmovs, 0b000, 0b00, 0b000000);
+public:
+ INSN(fabss, 0b000, 0b00, 0b000001);
+ INSN(fnegs, 0b000, 0b00, 0b000010);
+ INSN(fsqrts, 0b000, 0b00, 0b000011);
+ INSN(fcvts, 0b000, 0b00, 0b000101); // Single-precision to double-precision
+
+private:
+ INSN(i_fmovd, 0b000, 0b01, 0b000000);
+public:
+ INSN(fabsd, 0b000, 0b01, 0b000001);
+ INSN(fnegd, 0b000, 0b01, 0b000010);
+ INSN(fsqrtd, 0b000, 0b01, 0b000011);
+ INSN(fcvtd, 0b000, 0b01, 0b000100); // Double-precision to single-precision
+
+ void fmovd(FloatRegister Vd, FloatRegister Vn) {
+ assert(Vd != Vn, "should be");
+ i_fmovd(Vd, Vn);
+ }
+
+ void fmovs(FloatRegister Vd, FloatRegister Vn) {
+ assert(Vd != Vn, "should be");
+ i_fmovs(Vd, Vn);
+ }
+
+#undef INSN
+
+ // Floating-point data-processing (2 source)
+ void data_processing(unsigned op31, unsigned type, unsigned opcode,
+ FloatRegister Vd, FloatRegister Vn, FloatRegister Vm) {
+ starti;
+ f(op31, 31, 29);
+ f(0b11110, 28, 24);
+ f(type, 23, 22), f(1, 21), f(opcode, 15, 12), f(0b10, 11, 10);
+ rf(Vm, 16), rf(Vn, 5), rf(Vd, 0);
+ }
+
+#define INSN(NAME, op31, type, opcode) \
+ void NAME(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm) { \
+ data_processing(op31, type, opcode, Vd, Vn, Vm); \
+ }
+
+ INSN(fmuls, 0b000, 0b00, 0b0000);
+ INSN(fdivs, 0b000, 0b00, 0b0001);
+ INSN(fadds, 0b000, 0b00, 0b0010);
+ INSN(fsubs, 0b000, 0b00, 0b0011);
+ INSN(fnmuls, 0b000, 0b00, 0b1000);
+
+ INSN(fmuld, 0b000, 0b01, 0b0000);
+ INSN(fdivd, 0b000, 0b01, 0b0001);
+ INSN(faddd, 0b000, 0b01, 0b0010);
+ INSN(fsubd, 0b000, 0b01, 0b0011);
+ INSN(fnmuld, 0b000, 0b01, 0b1000);
+
+#undef INSN
+
+ // Floating-point data-processing (3 source)
+ void data_processing(unsigned op31, unsigned type, unsigned o1, unsigned o0,
+ FloatRegister Vd, FloatRegister Vn, FloatRegister Vm,
+ FloatRegister Va) {
+ starti;
+ f(op31, 31, 29);
+ f(0b11111, 28, 24);
+ f(type, 23, 22), f(o1, 21), f(o0, 15);
+ rf(Vm, 16), rf(Va, 10), rf(Vn, 5), rf(Vd, 0);
+ }
+
+#define INSN(NAME, op31, type, o1, o0) \
+ void NAME(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm, \
+ FloatRegister Va) { \
+ data_processing(op31, type, o1, o0, Vd, Vn, Vm, Va); \
+ }
+
+ INSN(fmadds, 0b000, 0b00, 0, 0);
+ INSN(fmsubs, 0b000, 0b00, 0, 1);
+ INSN(fnmadds, 0b000, 0b00, 1, 0);
+ INSN(fnmsubs, 0b000, 0b00, 1, 1);
+
+ INSN(fmaddd, 0b000, 0b01, 0, 0);
+ INSN(fmsubd, 0b000, 0b01, 0, 1);
+ INSN(fnmaddd, 0b000, 0b01, 1, 0);
+ INSN(fnmsub, 0b000, 0b01, 1, 1);
+
+#undef INSN
+
+ // Floating-point conditional select
+ void fp_conditional_select(unsigned op31, unsigned type,
+ unsigned op1, unsigned op2,
+ Condition cond, FloatRegister Vd,
+ FloatRegister Vn, FloatRegister Vm) {
+ starti;
+ f(op31, 31, 29);
+ f(0b11110, 28, 24);
+ f(type, 23, 22);
+ f(op1, 21, 21);
+ f(op2, 11, 10);
+ f(cond, 15, 12);
+ rf(Vm, 16), rf(Vn, 5), rf(Vd, 0);
+ }
+
+#define INSN(NAME, op31, type, op1, op2) \
+ void NAME(FloatRegister Vd, FloatRegister Vn, \
+ FloatRegister Vm, Condition cond) { \
+ fp_conditional_select(op31, type, op1, op2, cond, Vd, Vn, Vm); \
+ }
+
+ INSN(fcsels, 0b000, 0b00, 0b1, 0b11);
+ INSN(fcseld, 0b000, 0b01, 0b1, 0b11);
+
+#undef INSN
+
+ // Floating-point<->integer conversions
+ void float_int_convert(unsigned op31, unsigned type,
+ unsigned rmode, unsigned opcode,
+ Register Rd, Register Rn) {
+ starti;
+ f(op31, 31, 29);
+ f(0b11110, 28, 24);
+ f(type, 23, 22), f(1, 21), f(rmode, 20, 19);
+ f(opcode, 18, 16), f(0b000000, 15, 10);
+ zrf(Rn, 5), zrf(Rd, 0);
+ }
+
+#define INSN(NAME, op31, type, rmode, opcode) \
+ void NAME(Register Rd, FloatRegister Vn) { \
+ float_int_convert(op31, type, rmode, opcode, Rd, (Register)Vn); \
+ }
+
+ INSN(fcvtzsw, 0b000, 0b00, 0b11, 0b000);
+ INSN(fcvtzs, 0b100, 0b00, 0b11, 0b000);
+ INSN(fcvtzdw, 0b000, 0b01, 0b11, 0b000);
+ INSN(fcvtzd, 0b100, 0b01, 0b11, 0b000);
+
+ INSN(fmovs, 0b000, 0b00, 0b00, 0b110);
+ INSN(fmovd, 0b100, 0b01, 0b00, 0b110);
+
+ // INSN(fmovhid, 0b100, 0b10, 0b01, 0b110);
+
+#undef INSN
+
+#define INSN(NAME, op31, type, rmode, opcode) \
+ void NAME(FloatRegister Vd, Register Rn) { \
+ float_int_convert(op31, type, rmode, opcode, (Register)Vd, Rn); \
+ }
+
+ INSN(fmovs, 0b000, 0b00, 0b00, 0b111);
+ INSN(fmovd, 0b100, 0b01, 0b00, 0b111);
+
+ INSN(scvtfws, 0b000, 0b00, 0b00, 0b010);
+ INSN(scvtfs, 0b100, 0b00, 0b00, 0b010);
+ INSN(scvtfwd, 0b000, 0b01, 0b00, 0b010);
+ INSN(scvtfd, 0b100, 0b01, 0b00, 0b010);
+
+ // INSN(fmovhid, 0b100, 0b10, 0b01, 0b111);
+
+#undef INSN
+
+ // Floating-point compare
+ void float_compare(unsigned op31, unsigned type,
+ unsigned op, unsigned op2,
+ FloatRegister Vn, FloatRegister Vm = (FloatRegister)0) {
+ starti;
+ f(op31, 31, 29);
+ f(0b11110, 28, 24);
+ f(type, 23, 22), f(1, 21);
+ f(op, 15, 14), f(0b1000, 13, 10), f(op2, 4, 0);
+ rf(Vn, 5), rf(Vm, 16);
+ }
+
+
+#define INSN(NAME, op31, type, op, op2) \
+ void NAME(FloatRegister Vn, FloatRegister Vm) { \
+ float_compare(op31, type, op, op2, Vn, Vm); \
+ }
+
+#define INSN1(NAME, op31, type, op, op2) \
+ void NAME(FloatRegister Vn, double d) { \
+ assert_cond(d == 0.0); \
+ float_compare(op31, type, op, op2, Vn); \
+ }
+
+ INSN(fcmps, 0b000, 0b00, 0b00, 0b00000);
+ INSN1(fcmps, 0b000, 0b00, 0b00, 0b01000);
+ // INSN(fcmpes, 0b000, 0b00, 0b00, 0b10000);
+ // INSN1(fcmpes, 0b000, 0b00, 0b00, 0b11000);
+
+ INSN(fcmpd, 0b000, 0b01, 0b00, 0b00000);
+ INSN1(fcmpd, 0b000, 0b01, 0b00, 0b01000);
+ // INSN(fcmped, 0b000, 0b01, 0b00, 0b10000);
+ // INSN1(fcmped, 0b000, 0b01, 0b00, 0b11000);
+
+#undef INSN
+#undef INSN1
+
+ // Floating-point Move (immediate)
+private:
+ unsigned pack(double value);
+
+ void fmov_imm(FloatRegister Vn, double value, unsigned size) {
+ starti;
+ f(0b00011110, 31, 24), f(size, 23, 22), f(1, 21);
+ f(pack(value), 20, 13), f(0b10000000, 12, 5);
+ rf(Vn, 0);
+ }
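+ // n.b. pack() is expected to produce the 8-bit AArch64 "modified
+ // immediate" encoding; only values of the form +/-(n/16) * 2^r with
+ // 16 <= n <= 31 and -3 <= r <= 4 are representable, which is why
+ // fmovs/fmovd below fall back to an fmov from zr for 0.0.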
+
+public:
+
+ void fmovs(FloatRegister Vn, double value) {
+ if (value)
+ fmov_imm(Vn, value, 0b00);
+ else
+ fmovs(Vn, zr);
+ }
+ void fmovd(FloatRegister Vn, double value) {
+ if (value)
+ fmov_imm(Vn, value, 0b01);
+ else
+ fmovd(Vn, zr);
+ }
+
+/* SIMD extensions
+ *
+ * We just use FloatRegister in the following. They are exactly the same
+ * as SIMD registers.
+ */
+ public:
+
+ enum SIMD_Arrangement {
+ T8B, T16B, T4H, T8H, T2S, T4S, T1D, T2D
+ };
+
+ enum SIMD_RegVariant {
+ S32, D64, Q128
+ };
+
+
+ private:
+
+ void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn, int op1, int op2) {
+ starti;
+ f(0,31), f((int)T & 1, 30);
+ f(op1, 29, 21), f(0, 20, 16), f(op2, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn,
+ int imm, int op1, int op2) {
+ starti;
+ f(0,31), f((int)T & 1, 30);
+ f(op1 | 0b100, 29, 21), f(0b11111, 20, 16), f(op2, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+ void ld_st(FloatRegister Vt, SIMD_Arrangement T, Register Xn,
+ Register Xm, int op1, int op2) {
+ starti;
+ f(0,31), f((int)T & 1, 30);
+ f(op1 | 0b100, 29, 21), rf(Xm, 16), f(op2, 15, 12);
+ f((int)T >> 1, 11, 10), rf(Xn, 5), rf(Vt, 0);
+ }
+
+ void ld_st(FloatRegister Vt, SIMD_Arrangement T, Address a, int op1, int op2) {
+ switch (a.getMode()) {
+ case Address::base_plus_offset:
+ guarantee(a.offset() == 0, "no offset allowed here");
+ ld_st(Vt, T, a.base(), op1, op2);
+ break;
+ case Address::post:
+ ld_st(Vt, T, a.base(), a.offset(), op1, op2);
+ break;
+ case Address::base_plus_offset_reg:
+ ld_st(Vt, T, a.base(), a.index(), op1, op2);
+ break;
+ default:
+ ShouldNotReachHere();
+ }
+ }
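+ // The three ld_st overloads above correspond to the no-offset ([Xn]),
+ // post-indexed immediate ([Xn], #imm) and post-indexed register
+ // ([Xn], Xm) forms of the SIMD multi-structure loads and stores; e.g.
+ // an illustrative ld1(v0, T16B, Address(r1)) uses the first form.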
+
+ public:
+
+#define INSN1(NAME, op1, op2) \
+ void NAME(FloatRegister Vt, SIMD_Arrangement T, const Address &a) { \
+ ld_st(Vt, T, a, op1, op2); \
+ }
+
+#define INSN2(NAME, op1, op2) \
+ void NAME(FloatRegister Vt, FloatRegister Vt2, SIMD_Arrangement T, const Address &a) { \
+ assert(Vt->successor() == Vt2, "Registers must be ordered"); \
+ ld_st(Vt, T, a, op1, op2); \
+ }
+
+#define INSN3(NAME, op1, op2) \
+ void NAME(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, \
+ SIMD_Arrangement T, const Address &a) { \
+ assert(Vt->successor() == Vt2 && Vt2->successor() == Vt3, \
+ "Registers must be ordered"); \
+ ld_st(Vt, T, a, op1, op2); \
+ }
+
+#define INSN4(NAME, op1, op2) \
+ void NAME(FloatRegister Vt, FloatRegister Vt2, FloatRegister Vt3, \
+ FloatRegister Vt4, SIMD_Arrangement T, const Address &a) { \
+ assert(Vt->successor() == Vt2 && Vt2->successor() == Vt3 && \
+ Vt3->successor() == Vt4, "Registers must be ordered"); \
+ ld_st(Vt, T, a, op1, op2); \
+ }
+
+ INSN1(ld1, 0b001100010, 0b0111);
+ INSN2(ld1, 0b001100010, 0b1010);
+ INSN3(ld1, 0b001100010, 0b0110);
+ INSN4(ld1, 0b001100010, 0b0010);
+
+ INSN2(ld2, 0b001100010, 0b1000);
+ INSN3(ld3, 0b001100010, 0b0100);
+ INSN4(ld4, 0b001100010, 0b0000);
+
+ INSN1(st1, 0b001100000, 0b0111);
+ INSN2(st1, 0b001100000, 0b1010);
+ INSN3(st1, 0b001100000, 0b0110);
+ INSN4(st1, 0b001100000, 0b0010);
+
+ INSN2(st2, 0b001100000, 0b1000);
+ INSN3(st3, 0b001100000, 0b0100);
+ INSN4(st4, 0b001100000, 0b0000);
+
+ INSN1(ld1r, 0b001101010, 0b1100);
+ INSN2(ld2r, 0b001101011, 0b1100);
+ INSN3(ld3r, 0b001101010, 0b1110);
+ INSN4(ld4r, 0b001101011, 0b1110);
+
+#undef INSN1
+#undef INSN2
+#undef INSN3
+#undef INSN4
+
+#define INSN(NAME, opc) \
+ void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
+ starti; \
+ assert(T == T8B || T == T16B, "must be T8B or T16B"); \
+ f(0, 31), f((int)T & 1, 30), f(opc, 29, 21); \
+ rf(Vm, 16), f(0b000111, 15, 10), rf(Vn, 5), rf(Vd, 0); \
+ }
+
+ INSN(eor, 0b101110001);
+ INSN(orr, 0b001110101);
+ INSN(andr, 0b001110001);
+ INSN(bic, 0b001110011);
+ INSN(bif, 0b101110111);
+ INSN(bit, 0b101110101);
+ INSN(bsl, 0b101110011);
+ INSN(orn, 0b001110111);
+
+#undef INSN
+
+#define INSN(NAME, opc) \
+ void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
+ starti; \
+ f(0, 31), f((int)T & 1, 30), f(opc, 29), f(0b01110, 28, 24); \
+ f((int)T >> 1, 23, 22), f(1, 21), rf(Vm, 16), f(0b100001, 15, 10); \
+ rf(Vn, 5), rf(Vd, 0); \
+ }
+
+ INSN(addv, 0);
+ INSN(subv, 1);
+
+#undef INSN
+
+#define INSN(NAME, opc) \
+ void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm) { \
+ starti; \
+ assert(T == T4S, "arrangement must be T4S"); \
+ f(0b01011110000, 31, 21), rf(Vm, 16), f(opc, 15, 10), rf(Vn, 5), rf(Vd, 0); \
+ }
+
+ INSN(sha1c, 0b000000);
+ INSN(sha1m, 0b001000);
+ INSN(sha1p, 0b000100);
+ INSN(sha1su0, 0b001100);
+ INSN(sha256h2, 0b010100);
+ INSN(sha256h, 0b010000);
+ INSN(sha256su1, 0b011000);
+
+#undef INSN
+
+#define INSN(NAME, opc) \
+ void NAME(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) { \
+ starti; \
+ assert(T == T4S, "arrangement must be T4S"); \
+ f(0b0101111000101000, 31, 16), f(opc, 15, 10), rf(Vn, 5), rf(Vd, 0); \
+ }
+
+ INSN(sha1h, 0b000010);
+ INSN(sha1su1, 0b000110);
+ INSN(sha256su0, 0b001010);
+
+#undef INSN
+
+#define INSN(NAME, opc) \
+ void NAME(FloatRegister Vd, FloatRegister Vn) { \
+ starti; \
+ f(opc, 31, 10), rf(Vn, 5), rf(Vd, 0); \
+ }
+
+ INSN(aese, 0b0100111000101000010010);
+ INSN(aesd, 0b0100111000101000010110);
+ INSN(aesmc, 0b0100111000101000011010);
+ INSN(aesimc, 0b0100111000101000011110);
+
+#undef INSN
+
+ void shl(FloatRegister Vd, FloatRegister Vn, SIMD_Arrangement T, int shift){
+ starti;
+ /* The encodings for the immh:immb fields (bits 22:16) are
+ * 0001 xxx 8B/16B, shift = xxx
+ * 001x xxx 4H/8H, shift = xxxx
+ * 01xx xxx 2S/4S, shift = xxxxx
+ * 1xxx xxx 1D/2D, shift = xxxxxx (1D is RESERVED)
+ */
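+ // e.g. (illustrative) shl(v0, v1, T8H, 3): (T>>1)+3 == 4, so
+ // immh:immb = (1 << 4) | 3 = 0b0010011, the "001x xxx" 4H/8H form
+ // with shift = 3.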
+ assert((1 << ((T>>1)+3)) > shift, "Invalid Shift value");
+ f(0, 31), f(T & 1, 30), f(0b0011110, 29, 23), f((1 << ((T>>1)+3))|shift, 22, 16);
+ f(0b010101, 15, 10), rf(Vn, 5), rf(Vd, 0);
+ }
+
+ void ushll(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
+ starti;
+ /* The encodings for the immh:immb fields (bits 22:16) are
+ * 0001 xxx 8H, 8B/16b shift = xxx
+ * 001x xxx 4S, 4H/8H shift = xxxx
+ * 01xx xxx 2D, 2S/4S shift = xxxxx
+ * 1xxx xxx RESERVED
+ */
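+ // e.g. (illustrative) ushll(v0, T8H, v1, T8B, 4): (Tb>>1)+3 == 3, so
+ // immh:immb = (1 << 3) | 4 = 0b0001100, the "0001 xxx" 8H form with
+ // shift = 4; ushll2() differs only in taking the odd (second-half)
+ // source arrangement, whose low bit sets bit 30.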
+ assert((Tb >> 1) + 1 == (Ta >> 1), "Incompatible arrangement");
+ assert((1 << ((Tb>>1)+3)) > shift, "Invalid shift value");
+ f(0, 31), f(Tb & 1, 30), f(0b1011110, 29, 23), f((1 << ((Tb>>1)+3))|shift, 22, 16);
+ f(0b101001, 15, 10), rf(Vn, 5), rf(Vd, 0);
+ }
+ void ushll2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, SIMD_Arrangement Tb, int shift) {
+ ushll(Vd, Ta, Vn, Tb, shift);
+ }
+
+ void uzp1(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement T, int op = 0){
+ starti;
+ f(0, 31), f((T & 0x1), 30), f(0b001110, 29, 24), f((T >> 1), 23, 22), f(0, 21);
+ rf(Vm, 16), f(0, 15), f(op, 14), f(0b0110, 13, 10), rf(Vn, 5), rf(Vd, 0);
+ }
+ void uzp2(FloatRegister Vd, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement T){
+ uzp1(Vd, Vn, Vm, T, 1);
+ }
+
+ // Move from general purpose register
+ // mov Vd.T[index], Rn
+ void mov(FloatRegister Vd, SIMD_Arrangement T, int index, Register Xn) {
+ starti;
+ f(0b01001110000, 31, 21), f(((1 << (T >> 1)) | (index << ((T >> 1) + 1))), 20, 16);
+ f(0b000111, 15, 10), rf(Xn, 5), rf(Vd, 0);
+ }
+
+ // Move to general purpose register
+ // mov Rd, Vn.T[index]
+ void mov(Register Xd, FloatRegister Vn, SIMD_Arrangement T, int index) {
+ starti;
+ f(0, 31), f((T >= T1D) ? 1:0, 30), f(0b001110000, 29, 21);
+ f(((1 << (T >> 1)) | (index << ((T >> 1) + 1))), 20, 16);
+ f(0b001111, 15, 10), rf(Vn, 5), rf(Xd, 0);
+ }
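+ // In both mov forms bits 20..16 hold the imm5 element selector, e.g.
+ // (illustrative) for T = T4S and index = 1 the field is
+ // (1 << 2) | (1 << 3) = 0b01100, i.e. element size S, element 1.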
+
+ // We do not handle the 1Q arrangement.
+ void pmull(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
+ starti;
+ assert(Ta == T8H && (Tb == T8B || Tb == T16B), "Invalid Size specifier");
+ f(0, 31), f(Tb & 1, 30), f(0b001110001, 29, 21), rf(Vm, 16), f(0b111000, 15, 10);
+ rf(Vn, 5), rf(Vd, 0);
+ }
+ void pmull2(FloatRegister Vd, SIMD_Arrangement Ta, FloatRegister Vn, FloatRegister Vm, SIMD_Arrangement Tb) {
+ pmull(Vd, Ta, Vn, Vm, Tb);
+ }
+
+ void uqxtn(FloatRegister Vd, SIMD_Arrangement Tb, FloatRegister Vn, SIMD_Arrangement Ta) {
+ starti;
+ int size_b = (int)Tb >> 1;
+ int size_a = (int)Ta >> 1;
+ assert(size_b < 3 && size_b == size_a - 1, "Invalid size specifier");
+ f(0, 31), f(Tb & 1, 30), f(0b101110, 29, 24), f(size_b, 23, 22);
+ f(0b100001010010, 21, 10), rf(Vn, 5), rf(Vd, 0);
+ }
+
+ void rev32(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn)
+ {
+ starti;
+ assert(T <= T8H, "must be one of T8B, T16B, T4H, T8H");
+ f(0, 31), f((int)T & 1, 30), f(0b101110, 29, 24);
+ f(T <= T16B ? 0b00 : 0b01, 23, 22), f(0b100000000010, 21, 10);
+ rf(Vn, 5), rf(Vd, 0);
+ }
+
+ // CRC32 instructions
+#define INSN(NAME, sf, sz) \
+ void NAME(Register Rd, Register Rn, Register Rm) { \
+ starti; \
+ f(sf, 31), f(0b0011010110, 30, 21), f(0b0100, 15, 12), f(sz, 11, 10); \
+ rf(Rm, 16), rf(Rn, 5), rf(Rd, 0); \
+ }
+
+ INSN(crc32b, 0, 0b00);
+ INSN(crc32h, 0, 0b01);
+ INSN(crc32w, 0, 0b10);
+ INSN(crc32x, 1, 0b11);
+
+#undef INSN
+
+
+/* Simulator extensions to the ISA
+
+ haltsim
+
+ takes no arguments, causes the sim to enter a debug break and then
+ return from the simulator run() call with STATUS_HALT. The linking
+ code will call fatal() when it sees STATUS_HALT.
+
+ blrt Xn, Wm
+ blrt Xn, #gpargs, #fpargs, #type
+ Xn holds the 64 bit x86 branch_address
+ call format is encoded either as immediate data in the call
+ or in register Wm. In the latter case
+ Wm[13..6] = #gpargs,
+ Wm[5..2] = #fpargs,
+ Wm[1,0] = #type
+
+ calls the x86 code address 'branch_address' supplied in Xn passing
+ arguments taken from the general and floating point registers according
+ to the supplied counts 'gpargs' and 'fpargs'. may return a result in r0
+ or v0 according to the return type '#type', where
+
+ address branch_address;
+ uimm4 gpargs;
+ uimm4 fpargs;
+ enum ReturnType type;
+
+ enum ReturnType
+ {
+ void_ret = 0,
+ int_ret = 1,
+ long_ret = 1,
+ obj_ret = 1, // i.e. same as long
+ float_ret = 2,
+ double_ret = 3
+ }
+
+ notify
+
+ notifies the simulator of a transfer of control. instr[14:0]
+ identifies the type of change of control.
+
+ 0 ==> initial entry to a method.
+
+ 1 ==> return into a method from a submethod call.
+
+ 2 ==> exit out of Java method code.
+
+ 3 ==> start execution for a new bytecode.
+
+ in cases 1 and 2 the simulator is expected to use a JVM callback to
+ identify the name of the specific method being executed. in case 3
+ the simulator is expected to use a JVM callback to identify the
+ bytecode index.
+
+ Instruction encodings
+ ---------------------
+
+ These are encoded in the space with instr[28:25] = 00 which is
+ unallocated. Encodings are
+
+ 10987654321098765432109876543210
+ PSEUDO_HALT = 0b11100000000000000000000000000000
+ PSEUDO_BLRT = 0b11000000000000000_______________
+ PSEUDO_BLRTR = 0b1100000000000000100000__________
+ PSEUDO_NOTIFY = 0b10100000000000000_______________
+
+ instr[31,29] = op1 : 111 ==> HALT, 110 ==> BLRT/BLRTR, 101 ==> NOTIFY
+
+ for BLRT
+ instr[14,11] = #gpargs, instr[10,7] = #fpargs
+ instr[6,5] = #type, instr[4,0] = Rn
+ for BLRTR
+ instr[9,5] = Rm, instr[4,0] = Rn
+ for NOTIFY
+ instr[14:0] = type : 0 ==> entry, 1 ==> reentry, 2 ==> exit, 3 ==> bcstart
+*/
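+ // A minimal usage sketch (assuming the builtin simulator is active):
+ // blrt(rscratch1, 1, 0, 1) calls the x86 routine whose address is in
+ // rscratch1 with one general-register argument, no FP arguments and an
+ // integer return; when UseBuiltinSim is off the same call degenerates
+ // to a plain blr(rscratch1), as the definitions below show.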
+
+ enum NotifyType { method_entry, method_reentry, method_exit, bytecode_start };
+
+ virtual void notify(int type) {
+ if (UseBuiltinSim) {
+ starti;
+ // 109
+ f(0b101, 31, 29);
+ // 87654321098765
+ f(0b00000000000000, 28, 15);
+ f(type, 14, 0);
+ }
+ }
+
+ void blrt(Register Rn, int gpargs, int fpargs, int type) {
+ if (UseBuiltinSim) {
+ starti;
+ f(0b110, 31, 29);
+ f(0b00, 28, 25);
+ // 4321098765
+ f(0b0000000000, 24, 15);
+ f(gpargs, 14, 11);
+ f(fpargs, 10, 7);
+ f(type, 6, 5);
+ rf(Rn, 0);
+ } else {
+ blr(Rn);
+ }
+ }
+
+ void blrt(Register Rn, Register Rm) {
+ if (UseBuiltinSim) {
+ starti;
+ f(0b110, 31, 29);
+ f(0b00, 28, 25);
+ // 4321098765
+ f(0b0000000001, 24, 15);
+ // 43210
+ f(0b00000, 14, 10);
+ rf(Rm, 5);
+ rf(Rn, 0);
+ } else {
+ blr(Rn);
+ }
+ }
+
+ void haltsim() {
+ starti;
+ f(0b111, 31, 29);
+ f(0b00, 28, 27);
+ // 654321098765432109876543210
+ f(0b000000000000000000000000000, 26, 0);
+ }
+
+ Assembler(CodeBuffer* code) : AbstractAssembler(code) {
+ }
+
+ virtual RegisterOrConstant delayed_value_impl(intptr_t* delayed_value_addr,
+ Register tmp,
+ int offset) {
+ ShouldNotCallThis();
+ return RegisterOrConstant();
+ }
+
+ // Stack overflow checking
+ virtual void bang_stack_with_offset(int offset);
+
+ static bool operand_valid_for_logical_immediate(bool is32, uint64_t imm);
+ static bool operand_valid_for_add_sub_immediate(long imm);
+ static bool operand_valid_for_float_immediate(double imm);
+
+ void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
+ void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);
+};
+
+inline Assembler::Membar_mask_bits operator|(Assembler::Membar_mask_bits a,
+ Assembler::Membar_mask_bits b) {
+ return Assembler::Membar_mask_bits(unsigned(a)|unsigned(b));
+}
+
+Instruction_aarch64::~Instruction_aarch64() {
+ assem->emit();
+}
+
+#undef starti
+
+// Invert a condition
+inline const Assembler::Condition operator~(const Assembler::Condition cond) {
+ return Assembler::Condition(int(cond) ^ 1);
+}
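+// Flipping the low bit of an AArch64 condition code yields its inverse
+// for the codes used here, e.g. EQ (0b0000) <-> NE (0b0001) and
+// GE (0b1010) <-> LT (0b1011); AL/NV is the one pair this does not invert.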
+
+class BiasedLockingCounters;
+
+extern "C" void das(uint64_t start, int len);
+
+#endif // CPU_AARCH64_VM_ASSEMBLER_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/assembler_aarch64.inline.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_ASSEMBLER_AARCH64_INLINE_HPP
+#define CPU_AARCH64_VM_ASSEMBLER_AARCH64_INLINE_HPP
+
+#include "asm/assembler.inline.hpp"
+#include "asm/codeBuffer.hpp"
+#include "code/codeCache.hpp"
+
+#endif // CPU_AARCH64_VM_ASSEMBLER_AARCH64_INLINE_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/bytecodeInterpreter_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "interpreter/bytecodeInterpreter.hpp"
+#include "interpreter/bytecodeInterpreter.inline.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "oops/methodData.hpp"
+#include "oops/method.hpp"
+#include "oops/oop.inline.hpp"
+#include "prims/jvmtiExport.hpp"
+#include "prims/jvmtiThreadState.hpp"
+#include "runtime/deoptimization.hpp"
+#include "runtime/frame.inline.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "runtime/synchronizer.hpp"
+#include "runtime/vframeArray.hpp"
+#include "utilities/debug.hpp"
+#include "interp_masm_aarch64.hpp"
+
+#ifdef CC_INTERP
+
+#endif // CC_INTERP (all)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/bytecodeInterpreter_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_BYTECODEINTERPRETER_AARCH64_HPP
+#define CPU_AARCH64_VM_BYTECODEINTERPRETER_AARCH64_HPP
+
+// Platform specific for C++ based Interpreter
+
+private:
+
+ interpreterState _self_link; /* Previous interpreter state */ /* sometimes points to self??? */
+ address _result_handler; /* temp for saving native result handler */
+ intptr_t* _sender_sp; /* sender's sp before stack (locals) extension */
+
+ address _extra_junk1; /* temp to save on recompiles */
+ address _extra_junk2; /* temp to save on recompiles */
+ address _extra_junk3; /* temp to save on recompiles */
+ // address dummy_for_native2; /* a native frame result handler would be here... */
+ // address dummy_for_native1; /* native result type stored here in a interpreter native frame */
+ address _extra_junk4; /* temp to save on recompiles */
+ address _extra_junk5; /* temp to save on recompiles */
+ address _extra_junk6; /* temp to save on recompiles */
+public:
+ // we have an interpreter frame...
+inline intptr_t* sender_sp() {
+ return _sender_sp;
+}
+
+// The interpreter always has the frame anchor fully setup so we don't
+// have to do anything going to vm from the interpreter. On return
+// we do have to clear the flags in case they we're modified to
+// maintain the stack walking invariants.
+//
+#define SET_LAST_JAVA_FRAME()
+
+#define RESET_LAST_JAVA_FRAME()
+
+/*
+ * Macros for accessing the stack.
+ */
+#undef STACK_INT
+#undef STACK_FLOAT
+#undef STACK_ADDR
+#undef STACK_OBJECT
+#undef STACK_DOUBLE
+#undef STACK_LONG
+
+// JavaStack Implementation
+
+#define GET_STACK_SLOT(offset) (*((intptr_t*) &topOfStack[-(offset)]))
+#define STACK_SLOT(offset) ((address) &topOfStack[-(offset)])
+#define STACK_ADDR(offset) (*((address *) &topOfStack[-(offset)]))
+#define STACK_INT(offset) (*((jint*) &topOfStack[-(offset)]))
+#define STACK_FLOAT(offset) (*((jfloat *) &topOfStack[-(offset)]))
+#define STACK_OBJECT(offset) (*((oop *) &topOfStack [-(offset)]))
+#define STACK_DOUBLE(offset) (((VMJavaVal64*) &topOfStack[-(offset)])->d)
+#define STACK_LONG(offset) (((VMJavaVal64 *) &topOfStack[-(offset)])->l)
+
+#define SET_STACK_SLOT(value, offset) (*(intptr_t*)&topOfStack[-(offset)] = *(intptr_t*)(value))
+#define SET_STACK_ADDR(value, offset) (*((address *)&topOfStack[-(offset)]) = (value))
+#define SET_STACK_INT(value, offset) (*((jint *)&topOfStack[-(offset)]) = (value))
+#define SET_STACK_FLOAT(value, offset) (*((jfloat *)&topOfStack[-(offset)]) = (value))
+#define SET_STACK_OBJECT(value, offset) (*((oop *)&topOfStack[-(offset)]) = (value))
+#define SET_STACK_DOUBLE(value, offset) (((VMJavaVal64*)&topOfStack[-(offset)])->d = (value))
+#define SET_STACK_DOUBLE_FROM_ADDR(addr, offset) (((VMJavaVal64*)&topOfStack[-(offset)])->d = \
+ ((VMJavaVal64*)(addr))->d)
+#define SET_STACK_LONG(value, offset) (((VMJavaVal64*)&topOfStack[-(offset)])->l = (value))
+#define SET_STACK_LONG_FROM_ADDR(addr, offset) (((VMJavaVal64*)&topOfStack[-(offset)])->l = \
+ ((VMJavaVal64*)(addr))->l)
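+// n.b. offsets are relative to topOfStack, so e.g. STACK_INT(-1) expands
+// to a read of topOfStack[1], the slot the shared C++ interpreter treats
+// as the top of the operand stack.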
+// JavaLocals implementation
+
+#define LOCALS_SLOT(offset) ((intptr_t*)&locals[-(offset)])
+#define LOCALS_ADDR(offset) ((address)locals[-(offset)])
+#define LOCALS_INT(offset) ((jint)(locals[-(offset)]))
+#define LOCALS_FLOAT(offset) (*((jfloat*)&locals[-(offset)]))
+#define LOCALS_OBJECT(offset) ((oop)locals[-(offset)])
+#define LOCALS_DOUBLE(offset) (((VMJavaVal64*)&locals[-((offset) + 1)])->d)
+#define LOCALS_LONG(offset) (((VMJavaVal64*)&locals[-((offset) + 1)])->l)
+#define LOCALS_LONG_AT(offset) (((address)&locals[-((offset) + 1)]))
+#define LOCALS_DOUBLE_AT(offset) (((address)&locals[-((offset) + 1)]))
+
+#define SET_LOCALS_SLOT(value, offset) (*(intptr_t*)&locals[-(offset)] = *(intptr_t *)(value))
+#define SET_LOCALS_ADDR(value, offset) (*((address *)&locals[-(offset)]) = (value))
+#define SET_LOCALS_INT(value, offset) (*((jint *)&locals[-(offset)]) = (value))
+#define SET_LOCALS_FLOAT(value, offset) (*((jfloat *)&locals[-(offset)]) = (value))
+#define SET_LOCALS_OBJECT(value, offset) (*((oop *)&locals[-(offset)]) = (value))
+#define SET_LOCALS_DOUBLE(value, offset) (((VMJavaVal64*)&locals[-((offset)+1)])->d = (value))
+#define SET_LOCALS_LONG(value, offset) (((VMJavaVal64*)&locals[-((offset)+1)])->l = (value))
+#define SET_LOCALS_DOUBLE_FROM_ADDR(addr, offset) (((VMJavaVal64*)&locals[-((offset)+1)])->d = \
+ ((VMJavaVal64*)(addr))->d)
+#define SET_LOCALS_LONG_FROM_ADDR(addr, offset) (((VMJavaVal64*)&locals[-((offset)+1)])->l = \
+ ((VMJavaVal64*)(addr))->l)
+
+#endif // CPU_AARCH64_VM_BYTECODEINTERPRETER_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/bytecodeInterpreter_aarch64.inline.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_BYTECODEINTERPRETER_AARCH64_INLINE_HPP
+#define CPU_AARCH64_VM_BYTECODEINTERPRETER_AARCH64_INLINE_HPP
+
+// Inline interpreter functions for AArch64
+
+inline jfloat BytecodeInterpreter::VMfloatAdd(jfloat op1, jfloat op2) { return op1 + op2; }
+inline jfloat BytecodeInterpreter::VMfloatSub(jfloat op1, jfloat op2) { return op1 - op2; }
+inline jfloat BytecodeInterpreter::VMfloatMul(jfloat op1, jfloat op2) { return op1 * op2; }
+inline jfloat BytecodeInterpreter::VMfloatDiv(jfloat op1, jfloat op2) { return op1 / op2; }
+inline jfloat BytecodeInterpreter::VMfloatRem(jfloat op1, jfloat op2) { return fmod(op1, op2); }
+
+inline jfloat BytecodeInterpreter::VMfloatNeg(jfloat op) { return -op; }
+
+inline int32_t BytecodeInterpreter::VMfloatCompare(jfloat op1, jfloat op2, int32_t direction) {
+ return ( op1 < op2 ? -1 :
+ op1 > op2 ? 1 :
+ op1 == op2 ? 0 :
+ (direction == -1 || direction == 1) ? direction : 0);
+
+}
+
+inline void BytecodeInterpreter::VMmemCopy64(uint32_t to[2], const uint32_t from[2]) {
+ // x86 can do unaligned copies but not 64bits at a time
+ to[0] = from[0]; to[1] = from[1];
+}
+
+// The long operations depend on compiler support for "long long" on x86
+
+inline jlong BytecodeInterpreter::VMlongAdd(jlong op1, jlong op2) {
+ return op1 + op2;
+}
+
+inline jlong BytecodeInterpreter::VMlongAnd(jlong op1, jlong op2) {
+ return op1 & op2;
+}
+
+inline jlong BytecodeInterpreter::VMlongDiv(jlong op1, jlong op2) {
+ // QQQ what about check and throw...
+ return op1 / op2;
+}
+
+inline jlong BytecodeInterpreter::VMlongMul(jlong op1, jlong op2) {
+ return op1 * op2;
+}
+
+inline jlong BytecodeInterpreter::VMlongOr(jlong op1, jlong op2) {
+ return op1 | op2;
+}
+
+inline jlong BytecodeInterpreter::VMlongSub(jlong op1, jlong op2) {
+ return op1 - op2;
+}
+
+inline jlong BytecodeInterpreter::VMlongXor(jlong op1, jlong op2) {
+ return op1 ^ op2;
+}
+
+inline jlong BytecodeInterpreter::VMlongRem(jlong op1, jlong op2) {
+ return op1 % op2;
+}
+
+inline jlong BytecodeInterpreter::VMlongUshr(jlong op1, jint op2) {
+ // CVM did this 0x3f mask, is this really needed??? QQQ
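+ // (the mask does match Java semantics: JLS 15.19 uses only the low six
+ // bits of a long shift count, and it avoids shifting by >= 64, which
+ // would be undefined behaviour in C++)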
+ return ((unsigned long long) op1) >> (op2 & 0x3F);
+}
+
+inline jlong BytecodeInterpreter::VMlongShr(jlong op1, jint op2) {
+ return op1 >> (op2 & 0x3F);
+}
+
+inline jlong BytecodeInterpreter::VMlongShl(jlong op1, jint op2) {
+ return op1 << (op2 & 0x3F);
+}
+
+inline jlong BytecodeInterpreter::VMlongNeg(jlong op) {
+ return -op;
+}
+
+inline jlong BytecodeInterpreter::VMlongNot(jlong op) {
+ return ~op;
+}
+
+inline int32_t BytecodeInterpreter::VMlongLtz(jlong op) {
+ return (op <= 0);
+}
+
+inline int32_t BytecodeInterpreter::VMlongGez(jlong op) {
+ return (op >= 0);
+}
+
+inline int32_t BytecodeInterpreter::VMlongEqz(jlong op) {
+ return (op == 0);
+}
+
+inline int32_t BytecodeInterpreter::VMlongEq(jlong op1, jlong op2) {
+ return (op1 == op2);
+}
+
+inline int32_t BytecodeInterpreter::VMlongNe(jlong op1, jlong op2) {
+ return (op1 != op2);
+}
+
+inline int32_t BytecodeInterpreter::VMlongGe(jlong op1, jlong op2) {
+ return (op1 >= op2);
+}
+
+inline int32_t BytecodeInterpreter::VMlongLe(jlong op1, jlong op2) {
+ return (op1 <= op2);
+}
+
+inline int32_t BytecodeInterpreter::VMlongLt(jlong op1, jlong op2) {
+ return (op1 < op2);
+}
+
+inline int32_t BytecodeInterpreter::VMlongGt(jlong op1, jlong op2) {
+ return (op1 > op2);
+}
+
+inline int32_t BytecodeInterpreter::VMlongCompare(jlong op1, jlong op2) {
+ return (VMlongLt(op1, op2) ? -1 : VMlongGt(op1, op2) ? 1 : 0);
+}
+
+// Long conversions
+
+inline jdouble BytecodeInterpreter::VMlong2Double(jlong val) {
+ return (jdouble) val;
+}
+
+inline jfloat BytecodeInterpreter::VMlong2Float(jlong val) {
+ return (jfloat) val;
+}
+
+inline jint BytecodeInterpreter::VMlong2Int(jlong val) {
+ return (jint) val;
+}
+
+// Double Arithmetic
+
+inline jdouble BytecodeInterpreter::VMdoubleAdd(jdouble op1, jdouble op2) {
+ return op1 + op2;
+}
+
+inline jdouble BytecodeInterpreter::VMdoubleDiv(jdouble op1, jdouble op2) {
+ // Divide by zero... QQQ
+ return op1 / op2;
+}
+
+inline jdouble BytecodeInterpreter::VMdoubleMul(jdouble op1, jdouble op2) {
+ return op1 * op2;
+}
+
+inline jdouble BytecodeInterpreter::VMdoubleNeg(jdouble op) {
+ return -op;
+}
+
+inline jdouble BytecodeInterpreter::VMdoubleRem(jdouble op1, jdouble op2) {
+ return fmod(op1, op2);
+}
+
+inline jdouble BytecodeInterpreter::VMdoubleSub(jdouble op1, jdouble op2) {
+ return op1 - op2;
+}
+
+inline int32_t BytecodeInterpreter::VMdoubleCompare(jdouble op1, jdouble op2, int32_t direction) {
+ return ( op1 < op2 ? -1 :
+ op1 > op2 ? 1 :
+ op1 == op2 ? 0 :
+ (direction == -1 || direction == 1) ? direction : 0);
+}
+
+// Double Conversions
+
+inline jfloat BytecodeInterpreter::VMdouble2Float(jdouble val) {
+ return (jfloat) val;
+}
+
+// Float Conversions
+
+inline jdouble BytecodeInterpreter::VMfloat2Double(jfloat op) {
+ return (jdouble) op;
+}
+
+// Integer Arithmetic
+
+inline jint BytecodeInterpreter::VMintAdd(jint op1, jint op2) {
+ return op1 + op2;
+}
+
+inline jint BytecodeInterpreter::VMintAnd(jint op1, jint op2) {
+ return op1 & op2;
+}
+
+inline jint BytecodeInterpreter::VMintDiv(jint op1, jint op2) {
+ /* it's possible we could catch this special case implicitly */
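+ /* the only overflowing case is min_jint / -1, which Java requires to
+ wrap back to min_jint; that is what the explicit test below yields */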
+ if ((juint)op1 == 0x80000000 && op2 == -1) return op1;
+ else return op1 / op2;
+}
+
+inline jint BytecodeInterpreter::VMintMul(jint op1, jint op2) {
+ return op1 * op2;
+}
+
+inline jint BytecodeInterpreter::VMintNeg(jint op) {
+ return -op;
+}
+
+inline jint BytecodeInterpreter::VMintOr(jint op1, jint op2) {
+ return op1 | op2;
+}
+
+inline jint BytecodeInterpreter::VMintRem(jint op1, jint op2) {
+ /* it's possible we could catch this special case implicitly */
+ if ((juint)op1 == 0x80000000 && op2 == -1) return 0;
+ else return op1 % op2;
+}
+
+inline jint BytecodeInterpreter::VMintShl(jint op1, jint op2) {
+ return op1 << op2;
+}
+
+inline jint BytecodeInterpreter::VMintShr(jint op1, jint op2) {
+ return op1 >> (op2 & 0x1f);
+}
+
+inline jint BytecodeInterpreter::VMintSub(jint op1, jint op2) {
+ return op1 - op2;
+}
+
+inline jint BytecodeInterpreter::VMintUshr(jint op1, jint op2) {
+ return ((juint) op1) >> (op2 & 0x1f);
+}
+
+inline jint BytecodeInterpreter::VMintXor(jint op1, jint op2) {
+ return op1 ^ op2;
+}
+
+inline jdouble BytecodeInterpreter::VMint2Double(jint val) {
+ return (jdouble) val;
+}
+
+inline jfloat BytecodeInterpreter::VMint2Float(jint val) {
+ return (jfloat) val;
+}
+
+inline jlong BytecodeInterpreter::VMint2Long(jint val) {
+ return (jlong) val;
+}
+
+inline jchar BytecodeInterpreter::VMint2Char(jint val) {
+ return (jchar) val;
+}
+
+inline jshort BytecodeInterpreter::VMint2Short(jint val) {
+ return (jshort) val;
+}
+
+inline jbyte BytecodeInterpreter::VMint2Byte(jint val) {
+ return (jbyte) val;
+}
+
+#endif // CPU_AARCH64_VM_BYTECODEINTERPRETER_AARCH64_INLINE_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/bytecodes_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "interpreter/bytecodes.hpp"
+
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/bytecodes_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_BYTECODES_AARCH64_HPP
+#define CPU_AARCH64_VM_BYTECODES_AARCH64_HPP
+
+// No aarch64 specific bytecodes
+
+#endif // CPU_AARCH64_VM_BYTECODES_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/bytes_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_BYTES_AARCH64_HPP
+#define CPU_AARCH64_VM_BYTES_AARCH64_HPP
+
+#include "memory/allocation.hpp"
+
+class Bytes: AllStatic {
+ public:
+ // Returns true if the byte ordering used by Java is different from the native byte ordering
+ // of the underlying machine. For example, this is true for Intel x86, but false for Solaris
+ // on Sparc.
+ static inline bool is_Java_byte_ordering_different(){ return true; }
+
+
+ // Efficient reading and writing of unaligned unsigned data in platform-specific byte ordering
+ // (no special code is needed since AArch64 can access unaligned data)
+ static inline u2 get_native_u2(address p) { return *(u2*)p; }
+ static inline u4 get_native_u4(address p) { return *(u4*)p; }
+ static inline u8 get_native_u8(address p) { return *(u8*)p; }
+
+ static inline void put_native_u2(address p, u2 x) { *(u2*)p = x; }
+ static inline void put_native_u4(address p, u4 x) { *(u4*)p = x; }
+ static inline void put_native_u8(address p, u8 x) { *(u8*)p = x; }
+
+
+ // Efficient reading and writing of unaligned unsigned data in Java
+ // byte ordering (i.e. big-endian ordering). Byte-order reversal is
+ // needed since this AArch64 target runs little-endian.
+ static inline u2 get_Java_u2(address p) { return swap_u2(get_native_u2(p)); }
+ static inline u4 get_Java_u4(address p) { return swap_u4(get_native_u4(p)); }
+ static inline u8 get_Java_u8(address p) { return swap_u8(get_native_u8(p)); }
+
+ static inline void put_Java_u2(address p, u2 x) { put_native_u2(p, swap_u2(x)); }
+ static inline void put_Java_u4(address p, u4 x) { put_native_u4(p, swap_u4(x)); }
+ static inline void put_Java_u8(address p, u8 x) { put_native_u8(p, swap_u8(x)); }
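+ // e.g. (illustrative) get_Java_u2(p) over the bytes { 0xCA, 0xFE }
+ // yields 0xCAFE, whereas get_native_u2(p) yields 0xFECA on this
+ // little-endian target.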
+
+
+ // Efficient swapping of byte ordering
+ static inline u2 swap_u2(u2 x); // compiler-dependent implementation
+ static inline u4 swap_u4(u4 x); // compiler-dependent implementation
+ static inline u8 swap_u8(u8 x);
+};
+
+
+// The following header contains the implementations of swap_u2, swap_u4 and swap_u8
+
+#ifdef TARGET_OS_ARCH_linux_aarch64
+# include "bytes_linux_aarch64.inline.hpp"
+#endif
+
+#endif // CPU_AARCH64_VM_BYTES_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/codeBuffer_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_CODEBUFFER_AARCH64_HPP
+#define CPU_AARCH64_VM_CODEBUFFER_AARCH64_HPP
+
+private:
+ void pd_initialize() {}
+
+public:
+ void flush_bundle(bool start_new_bundle) {}
+
+#endif // CPU_AARCH64_VM_CODEBUFFER_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/compiledIC_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.inline.hpp"
+#include "code/compiledIC.hpp"
+#include "code/icBuffer.hpp"
+#include "code/nmethod.hpp"
+#include "memory/resourceArea.hpp"
+#include "runtime/mutexLocker.hpp"
+#include "runtime/safepoint.hpp"
+
+// Release the CompiledICHolder* associated with this call site if there is one.
+void CompiledIC::cleanup_call_site(virtual_call_Relocation* call_site) {
+ // This call site might have become stale so inspect it carefully.
+ NativeCall* call = nativeCall_at(call_site->addr());
+ if (is_icholder_entry(call->destination())) {
+ NativeMovConstReg* value = nativeMovConstReg_at(call_site->cached_value());
+ InlineCacheBuffer::queue_for_release((CompiledICHolder*)value->data());
+ }
+}
+
+bool CompiledIC::is_icholder_call_site(virtual_call_Relocation* call_site) {
+ // This call site might have become stale so inspect it carefully.
+ NativeCall* call = nativeCall_at(call_site->addr());
+ return is_icholder_entry(call->destination());
+}
+
+// ----------------------------------------------------------------------------
+
+#define __ _masm.
+void CompiledStaticCall::emit_to_interp_stub(CodeBuffer &cbuf) {
+ // Stub is fixed up when the corresponding call is converted from
+ // calling compiled code to calling interpreted code.
+ // mov rmethod, 0
+ // jmp -4 # to self
+
+ address mark = cbuf.insts_mark(); // Get mark within main instrs section.
+
+ // Note that the code buffer's insts_mark is always relative to insts.
+ // That's why we must use the macroassembler to generate a stub.
+ MacroAssembler _masm(&cbuf);
+
+ address base = __ start_a_stub(to_interp_stub_size()*2);
+
+ int offset = __ offset();
+ if (base == NULL) return; // CodeBuffer::expand failed
+ // static stub relocation stores the instruction address of the call
+ __ relocate(static_stub_Relocation::spec(mark));
+ // static stub relocation also tags the Method* in the code-stream.
+ __ mov_metadata(rmethod, (Metadata*)NULL);
+ __ movptr(rscratch1, 0);
+ __ br(rscratch1);
+
+ assert((__ offset() - offset) <= (int)to_interp_stub_size(), "stub too big");
+ __ end_a_stub();
+}
+#undef __
+
+int CompiledStaticCall::to_interp_stub_size() {
+ return 7 * NativeInstruction::instruction_size;
+}
+
+// Relocation entries for call stub, compiled java to interpreter.
+int CompiledStaticCall::reloc_to_interp_stub() {
+ return 4; // 3 in emit_to_interp_stub + 1 in emit_call
+}
+
+void CompiledStaticCall::set_to_interpreted(methodHandle callee, address entry) {
+ address stub = find_stub();
+ guarantee(stub != NULL, "stub not found");
+
+ if (TraceICs) {
+ ResourceMark rm;
+ tty->print_cr("CompiledStaticCall@" INTPTR_FORMAT ": set_to_interpreted %s",
+ p2i(instruction_address()),
+ callee->name_and_sig_as_C_string());
+ }
+
+ // Creation also verifies the object.
+ NativeMovConstReg* method_holder = nativeMovConstReg_at(stub);
+#ifndef PRODUCT
+ NativeGeneralJump* jump = nativeGeneralJump_at(method_holder->next_instruction_address());
+
+ assert(method_holder->data() == 0 || method_holder->data() == (intptr_t)callee(),
+ "a) MT-unsafe modification of inline cache");
+ assert(method_holder->data() == 0 || jump->jump_destination() == entry,
+ "b) MT-unsafe modification of inline cache");
+#endif
+ // Update stub.
+ method_holder->set_data((intptr_t)callee());
+ NativeGeneralJump::insert_unconditional(method_holder->next_instruction_address(), entry);
+ ICache::invalidate_range(stub, to_interp_stub_size());
+ // Update jump to call.
+ set_destination_mt_safe(stub);
+}
+
+void CompiledStaticCall::set_stub_to_clean(static_stub_Relocation* static_stub) {
+ assert (CompiledIC_lock->is_locked() || SafepointSynchronize::is_at_safepoint(), "mt unsafe call");
+ // Reset stub.
+ address stub = static_stub->addr();
+ assert(stub != NULL, "stub not found");
+ // Creation also verifies the object.
+ NativeMovConstReg* method_holder = nativeMovConstReg_at(stub);
+ method_holder->set_data(0);
+}
+
+//-----------------------------------------------------------------------------
+// Non-product mode code
+#ifndef PRODUCT
+
+void CompiledStaticCall::verify() {
+ // Verify call.
+ NativeCall::verify();
+ if (os::is_MP()) {
+ verify_alignment();
+ }
+
+ // Verify stub.
+ address stub = find_stub();
+ assert(stub != NULL, "no stub found for static call");
+ // Creation also verifies the object.
+ NativeMovConstReg* method_holder = nativeMovConstReg_at(stub);
+ NativeJump* jump = nativeJump_at(method_holder->next_instruction_address());
+
+ // Verify state.
+ assert(is_clean() || is_call_to_compiled() || is_call_to_interpreted(), "sanity check");
+}
+
+#endif // !PRODUCT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/copy_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_COPY_AARCH64_HPP
+#define CPU_AARCH64_VM_COPY_AARCH64_HPP
+
+// Inline functions for memory copy and fill.
+
+// Contains inline asm implementations
+#ifdef TARGET_OS_ARCH_linux_aarch64
+# include "copy_linux_aarch64.inline.hpp"
+#endif
+
+
+static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) {
+ julong* to = (julong*) tohw;
+ julong v = ((julong) value << 32) | value;
+ while (count-- > 0) {
+ *to++ = v;
+ }
+}
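+// e.g. (illustrative) with value == 0x12345678 each 64-bit store writes
+// 0x1234567812345678, filling two 32-bit words per iteration.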
+
+static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) {
+ pd_fill_to_words(tohw, count, value);
+}
+
+static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {
+ (void)memset(to, value, count);
+}
+
+static void pd_zero_to_words(HeapWord* tohw, size_t count) {
+ pd_fill_to_words(tohw, count, 0);
+}
+
+static void pd_zero_to_bytes(void* to, size_t count) {
+ (void)memset(to, 0, count);
+}
+
+#endif // CPU_AARCH64_VM_COPY_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/cppInterpreterGenerator_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_CPPINTERPRETERGENERATOR_AARCH64_HPP
+#define CPU_AARCH64_VM_CPPINTERPRETERGENERATOR_AARCH64_HPP
+
+ protected:
+
+ void generate_more_monitors();
+ void generate_deopt_handling();
+
+#endif // CPU_AARCH64_VM_CPPINTERPRETERGENERATOR_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/cpustate_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,595 @@
+/*
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef _CPU_STATE_H
+#define _CPU_STATE_H
+
+#include <sys/types.h>
+
+/*
+ * symbolic names used to identify general registers which also match
+ * the registers indices in machine code
+ *
+ * We have 32 general registers which can be read/written as 32 bit or
+ * 64 bit sources/sinks and are appropriately referred to as Wn or Xn
+ * in the assembly code. Some instructions mix these access modes
+ * (e.g. ADD X0, X1, W2) so the implementation of the instruction
+ * needs to *know* which type of read or write access is required.
+ */
+enum GReg {
+ R0,
+ R1,
+ R2,
+ R3,
+ R4,
+ R5,
+ R6,
+ R7,
+ R8,
+ R9,
+ R10,
+ R11,
+ R12,
+ R13,
+ R14,
+ R15,
+ R16,
+ R17,
+ R18,
+ R19,
+ R20,
+ R21,
+ R22,
+ R23,
+ R24,
+ R25,
+ R26,
+ R27,
+ R28,
+ R29,
+ R30,
+ R31,
+ // and now the aliases
+ RSCRATCH1=R8,
+ RSCRATCH2=R9,
+ RMETHOD=R12,
+ RESP=R20,
+ RDISPATCH=R21,
+ RBCP=R22,
+ RLOCALS=R24,
+ RMONITORS=R25,
+ RCPOOL=R26,
+ RHEAPBASE=R27,
+ RTHREAD=R28,
+ FP = R29,
+ LR = R30,
+ SP = R31,
+ ZR = R31
+};
+
+/*
+ * symbolic names used to refer to floating point registers which also
+ * match the registers indices in machine code
+ *
+ * We have 32 FP registers which can be read/written as 8, 16, 32, 64
+ * and 128 bit sources/sinks and are appropriately referred to as Bn,
+ * Hn, Sn, Dn and Qn in the assembly code. Some instructions mix these
+ * access modes (e.g. FCVT S0, D0) so the implementation of the
+ * instruction needs to *know* which type of read or write access is
+ * required.
+ */
+
+enum VReg {
+ V0,
+ V1,
+ V2,
+ V3,
+ V4,
+ V5,
+ V6,
+ V7,
+ V8,
+ V9,
+ V10,
+ V11,
+ V12,
+ V13,
+ V14,
+ V15,
+ V16,
+ V17,
+ V18,
+ V19,
+ V20,
+ V21,
+ V22,
+ V23,
+ V24,
+ V25,
+ V26,
+ V27,
+ V28,
+ V29,
+ V30,
+ V31,
+};
+
+/**
+ * all the different integer bit patterns for the components of a
+ * general register are overlaid here using a union so as to allow all
+ * reading and writing of the desired bits.
+ *
+ * n.b. the ARM spec says that when you write a 32 bit register you
+ * are supposed to write the low 32 bits and zero the high 32
+ * bits. But we don't actually have to care about this because Java
+ * will only ever consume the 32 bit value as a 64 bit quantity after
+ * an explicit extend.
+ */
+union GRegisterValue
+{
+ int8_t s8;
+ int16_t s16;
+ int32_t s32;
+ int64_t s64;
+ u_int8_t u8;
+ u_int16_t u16;
+ u_int32_t u32;
+ u_int64_t u64;
+};
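+
+// n.b. an illustrative consequence of the overlay (assuming a little-endian
+// host, as with the x86 builtin sim): after
+//
+//   GRegisterValue v;
+//   v.u64 = 0x00000000DEADBEEFull;
+//
+// the narrower members alias the low-order bytes, so v.u32 == 0xDEADBEEF
+// and v.u16 == 0xBEEF.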
+
+class GRegister
+{
+public:
+ GRegisterValue value;
+};
+
+/*
+ * float registers provide for storage of a single, double or quad
+ * word format float in the same register. single floats are not
+ * paired within each double register as per 32 bit arm. instead each
+ * 128 bit register Vn embeds the bits for Sn, and Dn in the lower
+ * quarter and half, respectively, of the bits for Qn.
+ *
+ * The upper bits can also be accessed as single or double floats by
+ * the float vector operations using indexing e.g. V1.D[1], V1.S[3]
+ * etc. and, for SIMD operations, using a horrible index range notation.
+ *
+ * The spec also talks about accessing float registers as half words
+ * and bytes with Hn and Bn providing access to the low 16 and 8 bits
+ * of Vn but it is not really clear what these bits represent. We can
+ * probably ignore this for Java anyway. However, we do need to access
+ * the raw bits at 32 and 64 bit resolution to load to/from integer
+ * registers.
+ */
+
+union FRegisterValue
+{
+ float s;
+ double d;
+ long double q;
+ // eventually we will need to be able to access the data as a vector
+ // the integral array elements allow us to access the bits in s, d,
+ // q, vs and vd at an appropriate level of granularity
+ u_int8_t vb[16];
+ u_int16_t vh[8];
+ u_int32_t vw[4];
+ u_int64_t vx[2];
+ float vs[4];
+ double vd[2];
+};
+
+class FRegister
+{
+public:
+ FRegisterValue value;
+};
+
+/*
+ * CPSR register -- this does not exist as a directly accessible
+ * register but we need to store the flags so we can implement
+ * flag-setting and flag-testing operations
+ *
+ * we can possibly use injected x86 asm to report the outcome of flag
+ * setting operations. if so we will need to grab the flags
+ * immediately after the operation in order to ensure we don't lose
+ * them because of the actions of the simulator. so we still need
+ * somewhere to store the condition codes.
+ */
+
+class CPSRRegister
+{
+public:
+ u_int32_t value;
+
+/*
+ * condition register bit select values
+ *
+ * the order of bits here is important because some of
+ * the flag setting conditional instructions employ a
+ * bit field to populate the flags when a false condition
+ * bypasses execution of the operation and we want to
+ * be able to assign the flags register using the
+ * supplied value.
+ */
+
+ enum CPSRIdx {
+ V_IDX,
+ C_IDX,
+ Z_IDX,
+ N_IDX
+ };
+
+ enum CPSRMask {
+ V = 1 << V_IDX,
+ C = 1 << C_IDX,
+ Z = 1 << Z_IDX,
+ N = 1 << N_IDX
+ };
+
+ static const int CPSR_ALL_FLAGS = (V | C | Z | N);
+};
+
+// auxiliary function to assemble the relevant bits from
+// the x86 EFLAGS register into an ARM CPSR value
+
+#define X86_V_IDX 11
+#define X86_C_IDX 0
+#define X86_Z_IDX 6
+#define X86_N_IDX 7
+
+#define X86_V (1 << X86_V_IDX)
+#define X86_C (1 << X86_C_IDX)
+#define X86_Z (1 << X86_Z_IDX)
+#define X86_N (1 << X86_N_IDX)
+
+inline u_int32_t convertX86Flags(u_int32_t x86flags)
+{
+ u_int32_t flags;
+ // set N flag
+ flags = ((x86flags & X86_N) >> X86_N_IDX);
+ // shift then or in Z flag
+ flags <<= 1;
+ flags |= ((x86flags & X86_Z) >> X86_Z_IDX);
+ // shift then or in C flag
+ flags <<= 1;
+ flags |= ((x86flags & X86_C) >> X86_C_IDX);
+ // shift then or in V flag
+ flags <<= 1;
+ flags |= ((x86flags & X86_V) >> X86_V_IDX);
+
+ return flags;
+}
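+
+// worked example (hand-traced from the code above): an EFLAGS value with
+// only ZF (bit 6) set converts to 0b0100, i.e. just the ARM Z flag
+// (Z_IDX == 2); a value with SF and CF set converts to 0b1010, i.e. N and C.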
+
+inline u_int32_t convertX86FlagsFP(u_int32_t x86flags)
+{
+ // x86 flags set by fcomi(x,y) are ZF:PF:CF
+ // (yes, that's PF for parity, WTF?)
+ // where
+ // 0) 0:0:0 means x > y
+ // 1) 0:0:1 means x < y
+ // 2) 1:0:0 means x = y
+ // 3) 1:1:1 means x and y are unordered
+ // note that we don't have to check PF so
+ // we really have a simple 2-bit case switch
+ // the corresponding ARM64 flags settings
+ // in hi->lo bit order are
+ // 0) --C-
+ // 1) N---
+ // 2) -ZC-
+ // 3) --CV
+
+ static u_int32_t armFlags[] = {
+ 0b0010,
+ 0b1000,
+ 0b0110,
+ 0b0011
+ };
+ // pick out the ZF and CF bits
+ u_int32_t zc = ((x86flags & X86_Z) >> X86_Z_IDX);
+ zc <<= 1;
+ zc |= ((x86flags & X86_C) >> X86_C_IDX);
+
+ return armFlags[zc];
+}
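+
+// worked example (hand-traced from the code above): fcomi reporting x < y
+// sets CF alone, so zc == 0b01 and the table returns 0b1000 -- just the N
+// flag, matching case 1) in the comment above.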
+
+/*
+ * FPSR register -- floating point status register
+ *
+ * this register includes IDC, IXC, UFC, OFC, DZC, IOC and QC bits,
+ * and the floating point N, Z, C, V bits but the latter are unused in
+ * aarch64 mode. the sim ignores QC for now.
+ *
+ * bit positions are as per the ARMv7 FPSCR register
+ *
+ * IDC : 7 ==> Input Denormal (cumulative exception bit)
+ * IXC : 4 ==> Inexact
+ * UFC : 3 ==> Underflow
+ * OFC : 2 ==> Overflow
+ * DZC : 1 ==> Division by Zero
+ * IOC : 0 ==> Invalid Operation
+ */
+
+class FPSRRegister
+{
+public:
+ u_int32_t value;
+ // indices for bits in the FPSR register value
+ enum FPSRIdx {
+ IO_IDX = 0,
+ DZ_IDX = 1,
+ OF_IDX = 2,
+ UF_IDX = 3,
+ IX_IDX = 4,
+ ID_IDX = 7
+ };
+ // corresponding bits as numeric values
+ enum FPSRMask {
+ IO = (1 << IO_IDX),
+ DZ = (1 << DZ_IDX),
+ OF = (1 << OF_IDX),
+ UF = (1 << UF_IDX),
+ IX = (1 << IX_IDX),
+ ID = (1 << ID_IDX)
+ };
+ static const int FPSR_ALL_FPSRS = (IO | DZ | OF | UF | IX | ID);
+};
+
+// debugger support
+
+enum PrintFormat
+{
+ FMT_DECIMAL,
+ FMT_HEX,
+ FMT_SINGLE,
+ FMT_DOUBLE,
+ FMT_QUAD,
+ FMT_MULTI
+};
+
+/*
+ * model of the registers and other state associated with the cpu
+ */
+class CPUState
+{
+ friend class AArch64Simulator;
+private:
+ // this is the PC of the instruction being executed
+ u_int64_t pc;
+ // this is the PC of the instruction to be executed next
+ // it is defaulted to pc + 4 at instruction decode but
+ // execute may reset it
+
+ u_int64_t nextpc;
+ GRegister gr[33]; // extra register at index 32 is used
+ // to hold zero value
+ FRegister fr[32];
+ CPSRRegister cpsr;
+ FPSRRegister fpsr;
+
+public:
+
+ CPUState() {
+ gr[20].value.u64 = 0; // establish initial condition for
+ // checkAssertions()
+ trace_counter = 0;
+ }
+
+ // General Register access macros
+
+ // only xreg or xregs can be used as an lvalue in order to update a
+ // register. this ensures that the top part of a register is always
+ // assigned when it is written by the sim.
+
+ inline u_int64_t &xreg(GReg reg, int r31_is_sp) {
+ if (reg == R31 && !r31_is_sp) {
+ return gr[32].value.u64;
+ } else {
+ return gr[reg].value.u64;
+ }
+ }
+
+ inline int64_t &xregs(GReg reg, int r31_is_sp) {
+ if (reg == R31 && !r31_is_sp) {
+ return gr[32].value.s64;
+ } else {
+ return gr[reg].value.s64;
+ }
+ }
+
+ inline u_int32_t wreg(GReg reg, int r31_is_sp) {
+ if (reg == R31 && !r31_is_sp) {
+ return gr[32].value.u32;
+ } else {
+ return gr[reg].value.u32;
+ }
+ }
+
+ inline int32_t wregs(GReg reg, int r31_is_sp) {
+ if (reg == R31 && !r31_is_sp) {
+ return gr[32].value.s32;
+ } else {
+ return gr[reg].value.s32;
+ }
+ }
+
+ inline u_int32_t hreg(GReg reg, int r31_is_sp) {
+ if (reg == R31 && !r31_is_sp) {
+ return gr[32].value.u16;
+ } else {
+ return gr[reg].value.u16;
+ }
+ }
+
+ inline int32_t hregs(GReg reg, int r31_is_sp) {
+ if (reg == R31 && !r31_is_sp) {
+ return gr[32].value.s16;
+ } else {
+ return gr[reg].value.s16;
+ }
+ }
+
+ inline u_int32_t breg(GReg reg, int r31_is_sp) {
+ if (reg == R31 && !r31_is_sp) {
+ return gr[32].value.u8;
+ } else {
+ return gr[reg].value.u8;
+ }
+ }
+
+ inline int32_t bregs(GReg reg, int r31_is_sp) {
+ if (reg == R31 && !r31_is_sp) {
+ return gr[32].value.s8;
+ } else {
+ return gr[reg].value.s8;
+ }
+ }
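+
+ // n.b. illustrative usage: with r31_is_sp == 0, R31 names the zero
+ // register, so xreg(ZR, 0) = 42 only writes the scratch slot gr[32];
+ // xreg(SP, 1) refers to gr[31], the real stack pointer.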
+
+ // FP Register access macros
+
+ // all non-vector accessors return a reference so we can both read
+ // and assign
+
+ inline float &sreg(VReg reg) {
+ return fr[reg].value.s;
+ }
+
+ inline double &dreg(VReg reg) {
+ return fr[reg].value.d;
+ }
+
+ inline long double &qreg(VReg reg) {
+ return fr[reg].value.q;
+ }
+
+ // all vector register accessors return a pointer
+
+ inline float *vsreg(VReg reg) {
+ return &fr[reg].value.vs[0];
+ }
+
+ inline double *vdreg(VReg reg) {
+ return &fr[reg].value.vd[0];
+ }
+
+ inline u_int8_t *vbreg(VReg reg) {
+ return &fr[reg].value.vb[0];
+ }
+
+ inline u_int16_t *vhreg(VReg reg) {
+ return &fr[reg].value.vh[0];
+ }
+
+ inline u_int32_t *vwreg(VReg reg) {
+ return &fr[reg].value.vw[0];
+ }
+
+ inline u_int64_t *vxreg(VReg reg) {
+ return &fr[reg].value.vx[0];
+ }
+
+ union GRegisterValue prev_sp, prev_fp;
+
+ static const int trace_size = 256;
+ u_int64_t trace_buffer[trace_size];
+ int trace_counter;
+
+ bool checkAssertions()
+ {
+ // Make sure that SP is 16-byte aligned
+ // Also make sure that ESP is above SP.
+ // We don't care about checking ESP if it is null, i.e. it hasn't
+ // been used yet.
+ if (gr[31].value.u64 & 0x0f) {
+ asm volatile("nop");
+ return false;
+ }
+ return true;
+ }
+
+ // pc register accessors
+
+ // this accessor can be used to fetch the current PC
+ u_int64_t getPC();
+ // instead of setting the current PC directly you can
+ // first set the next PC (either absolute or PC-relative)
+ // and later copy the next PC into the current PC
+ // this supports a default increment by 4 at instruction
+ // fetch with an optional reset by control instructions
+ u_int64_t getNextPC();
+ void setNextPC(u_int64_t next);
+ void offsetNextPC(int64_t offset);
+ // install nextpc as current pc
+ void updatePC();
+
+ // this helper can be used to save the next PC to LR
+ // just before installing a branch PC
+ inline void saveLR() { gr[LR].value.u64 = nextpc; }
+
+ // cpsr register accessors
+ u_int32_t getCPSRRegister();
+ void setCPSRRegister(u_int32_t flags);
+ // read a specific subset of the flags as a bit pattern
+ // mask should be composed using elements of enum FlagMask
+ u_int32_t getCPSRBits(u_int32_t mask);
+ // assign a specific subset of the flags as a bit pattern
+ // mask and value should be composed using elements of enum FlagMask
+ void setCPSRBits(u_int32_t mask, u_int32_t value);
+ // test the value of a single flag returned as 1 or 0
+ u_int32_t testCPSR(CPSRRegister::CPSRIdx idx);
+ // set a single flag
+ void setCPSR(CPSRRegister::CPSRIdx idx);
+ // clear a single flag
+ void clearCPSR(CPSRRegister::CPSRIdx idx);
+ // utility method to set ARM CPSR flags from an x86 bit mask generated by integer arithmetic
+ void setCPSRRegisterFromX86(u_int64_t x86Flags);
+ // utility method to set ARM CPSR flags from an x86 bit mask generated by floating compare
+ void setCPSRRegisterFromX86FP(u_int64_t x86Flags);
+
+ // fpsr register accessors
+ u_int32_t getFPSRRegister();
+ void setFPSRRegister(u_int32_t flags);
+ // read a specific subset of the fpsr bits as a bit pattern
+ // mask should be composed using elements of enum FPSRRegister::FlagMask
+ u_int32_t getFPSRBits(u_int32_t mask);
+ // assign a specific subset of the flags as a bit pattern
+ // mask and value should be composed using elements of enum FPSRRegister::FlagMask
+ void setFPSRBits(u_int32_t mask, u_int32_t value);
+ // test the value of a single flag returned as 1 or 0
+ u_int32_t testFPSR(FPSRRegister::FPSRIdx idx);
+ // set a single flag
+ void setFPSR(FPSRRegister::FPSRIdx idx);
+ // clear a single flag
+ void clearFPSR(FPSRRegister::FPSRIdx idx);
+
+ // debugger support
+ void printPC(int pending, const char *trailing = "\n");
+ void printInstr(u_int32_t instr, void (*dasm)(u_int64_t), const char *trailing = "\n");
+ void printGReg(GReg reg, PrintFormat format = FMT_HEX, const char *trailing = "\n");
+ void printVReg(VReg reg, PrintFormat format = FMT_HEX, const char *trailing = "\n");
+ void printCPSR(const char *trailing = "\n");
+ void printFPSR(const char *trailing = "\n");
+ void dumpState();
+};
+
+#endif // ifndef _CPU_STATE_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/debug_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "code/codeCache.hpp"
+#include "code/nmethod.hpp"
+#include "runtime/frame.hpp"
+#include "runtime/init.hpp"
+#include "runtime/os.hpp"
+#include "utilities/debug.hpp"
+#include "utilities/top.hpp"
+
+void pd_ps(frame f) {}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/decode_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,412 @@
+/*
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef _DECODE_H
+#define _DECODE_H
+
+#include <sys/types.h>
+#include "cpustate_aarch64.hpp"
+
+// bitfield immediate expansion helper
+
+extern int expandLogicalImmediate(u_int32_t immN, u_int32_t immr,
+ u_int32_t imms, u_int64_t &bimm);
+
+
+/*
+ * codes used in conditional instructions
+ *
+ * these are passed to conditional operations to identify which
+ * condition to test for
+ */
+enum CondCode {
+ EQ = 0b0000, // meaning Z == 1
+ NE = 0b0001, // meaning Z == 0
+ HS = 0b0010, // meaning C == 1
+ CS = HS,
+ LO = 0b0011, // meaning C == 0
+ CC = LO,
+ MI = 0b0100, // meaning N == 1
+ PL = 0b0101, // meaning N == 0
+ VS = 0b0110, // meaning V == 1
+ VC = 0b0111, // meaning V == 0
+ HI = 0b1000, // meaning C == 1 && Z == 0
+ LS = 0b1001, // meaning !(C == 1 && Z == 0)
+ GE = 0b1010, // meaning N == V
+ LT = 0b1011, // meaning N != V
+ GT = 0b1100, // meaning Z == 0 && N == V
+ LE = 0b1101, // meaning !(Z == 0 && N == V)
+ AL = 0b1110, // meaning ANY
+ NV = 0b1111 // ditto
+};
+
+/*
+ * certain addressing modes for load require pre or post writeback of
+ * the computed address to a base register
+ */
+enum WriteBack {
+ Post = 0,
+ Pre = 1
+};
+
+/*
+ * certain addressing modes for load require an offset to
+ * be optionally scaled so the decode needs to pass that
+ * through to the execute routine
+ */
+enum Scaling {
+ Unscaled = 0,
+ Scaled = 1
+};
+
+/*
+ * when we do have to scale we do so by shifting using
+ * log2(size of the data element in bytes) as the shift count,
+ * so we don't have to scale offsets when loading
+ * bytes.
+ */
+enum ScaleShift {
+ ScaleShift16 = 1,
+ ScaleShift32 = 2,
+ ScaleShift64 = 3,
+ ScaleShift128 = 4
+};
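+
+// illustrative example: a scaled 64-bit load/store with unsigned immediate
+// offset 3 addresses base + (3 << ScaleShift64), i.e. a byte offset of 24,
+// whereas an unscaled access would use the offset 3 directly.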
+
+/*
+ * one of the addressing modes for load requires a 32-bit register
+ * value to be either zero- or sign-extended; for these instructions
+ * UXTW or SXTW should be passed
+ *
+ * arithmetic register data processing operations can optionally
+ * extend a portion of the second register value; for these
+ * instructions the value supplied must identify the portion of the
+ * register which is to be zero- or sign-extended
+ */
+enum Extension {
+ UXTB = 0,
+ UXTH = 1,
+ UXTW = 2,
+ UXTX = 3,
+ SXTB = 4,
+ SXTH = 5,
+ SXTW = 6,
+ SXTX = 7
+};
+
+/*
+ * arithmetic and logical register data processing operations
+ * optionally perform a shift on the second register value
+ */
+enum Shift {
+ LSL = 0,
+ LSR = 1,
+ ASR = 2,
+ ROR = 3
+};
+
+/*
+ * bit twiddling helpers for instruction decode
+ */
+
+// 32 bit mask with bits [hi,...,lo] set
+
+static inline u_int32_t mask32(int hi = 31, int lo = 0)
+{
+ int nbits = (hi + 1) - lo;
+ return ((1 << nbits) - 1) << lo;
+}
+
+static inline u_int64_t mask64(int hi = 63, int lo = 0)
+{
+ int nbits = (hi + 1) - lo;
+ return ((1L << nbits) - 1) << lo;
+}
+
+// pick bits [hi,...,lo] from val
+static inline u_int32_t pick32(u_int32_t val, int hi = 31, int lo = 0)
+{
+ return (val & mask32(hi, lo));
+}
+
+// pick bits [hi,...,lo] from val
+static inline u_int64_t pick64(u_int64_t val, int hi = 63, int lo = 0)
+{
+ return (val & mask64(hi, lo));
+}
+
+// pick bits [hi,...,lo] from val and shift to [(hi-(newlo - lo)),newlo]
+static inline u_int32_t pickshift32(u_int32_t val, int hi = 31,
+ int lo = 0, int newlo = 0)
+{
+ u_int32_t bits = pick32(val, hi, lo);
+ if (lo < newlo) {
+ return (bits << (newlo - lo));
+ } else {
+ return (bits >> (lo - newlo));
+ }
+}
+// mask [hi,lo] and shift down to start at bit 0
+static inline u_int32_t pickbits32(u_int32_t val, int hi = 31, int lo = 0)
+{
+ return (pick32(val, hi, lo) >> lo);
+}
+
+// mask [hi,lo] and shift down to start at bit 0
+static inline u_int64_t pickbits64(u_int64_t val, int hi = 63, int lo = 0)
+{
+ return (pick64(val, hi, lo) >> lo);
+}
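+
+// worked examples (hand-traced from the helpers above):
+//   mask32(7, 4)               == 0xF0
+//   pickbits32(0xA5, 7, 4)     == 0xA
+//   pickshift32(0xA5, 7, 4, 8) == 0xA00  (field relocated to start at bit 8)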
+
+/*
+ * decode registers, immediates and constants of various types
+ */
+
+static inline GReg greg(u_int32_t val, int lo)
+{
+ return (GReg)pickbits32(val, lo + 4, lo);
+}
+
+static inline VReg vreg(u_int32_t val, int lo)
+{
+ return (VReg)pickbits32(val, lo + 4, lo);
+}
+
+static inline u_int32_t uimm(u_int32_t val, int hi, int lo)
+{
+ return pickbits32(val, hi, lo);
+}
+
+static inline int32_t simm(u_int32_t val, int hi = 31, int lo = 0) {
+ union {
+ u_int32_t u;
+ int32_t n;
+ };
+
+ u = val << (31 - hi);
+ n = n >> (31 - hi + lo);
+ return n;
+}
+
+static inline int64_t simm(u_int64_t val, int hi = 63, int lo = 0) {
+ union {
+ u_int64_t u;
+ int64_t n;
+ };
+
+ u = val << (63 - hi);
+ n = n >> (63 - hi + lo);
+ return n;
+}
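+
+// worked example (hand-traced from the code above): simm(0x1FFu, 8, 0)
+// sign-extends the 9-bit field 0x1FF to -1, while simm(0xFFu, 8, 0)
+// yields 255 because the top bit of the field is clear.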
+
+static inline Shift shift(u_int32_t val, int lo)
+{
+ return (Shift)pickbits32(val, lo+1, lo);
+}
+
+static inline Extension extension(u_int32_t val, int lo)
+{
+ return (Extension)pickbits32(val, lo+2, lo);
+}
+
+static inline Scaling scaling(u_int32_t val, int lo)
+{
+ return (Scaling)pickbits32(val, lo, lo);
+}
+
+static inline WriteBack writeback(u_int32_t val, int lo)
+{
+ return (WriteBack)pickbits32(val, lo, lo);
+}
+
+static inline CondCode condcode(u_int32_t val, int lo)
+{
+ return (CondCode)pickbits32(val, lo+3, lo);
+}
+
+/*
+ * operation decode
+ */
+// bits [28,25] are the primary dispatch vector
+
+static inline u_int32_t dispatchGroup(u_int32_t val)
+{
+ return pickshift32(val, 28, 25, 0);
+}
+
+/*
+ * the 16 possible values for bits [28,25] identified by tags which
+ * map them to the 5 main instruction groups LDST, DPREG, ADVSIMD,
+ * BREXSYS and DPIMM.
+ *
+ * An extra group PSEUDO is included in one of the unallocated ranges
+ * for simulator-specific pseudo-instructions.
+ */
+enum DispatchGroup {
+ GROUP_PSEUDO_0000,
+ GROUP_UNALLOC_0001,
+ GROUP_UNALLOC_0010,
+ GROUP_UNALLOC_0011,
+ GROUP_LDST_0100,
+ GROUP_DPREG_0101,
+ GROUP_LDST_0110,
+ GROUP_ADVSIMD_0111,
+ GROUP_DPIMM_1000,
+ GROUP_DPIMM_1001,
+ GROUP_BREXSYS_1010,
+ GROUP_BREXSYS_1011,
+ GROUP_LDST_1100,
+ GROUP_DPREG_1101,
+ GROUP_LDST_1110,
+ GROUP_ADVSIMD_1111
+};
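+
+// illustrative example: the ADD-immediate encoding 0x91000000
+// (add x0, x0, #0) has bits [28,25] == 0b1000, so dispatchGroup() selects
+// GROUP_DPIMM_1000 and decode continues with dispatchDPImm() below.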
+
+// bits [31, 29] of a Pseudo are the secondary dispatch vector
+
+static inline u_int32_t dispatchPseudo(u_int32_t val)
+{
+ return pickshift32(val, 31, 29, 0);
+}
+
+/*
+ * the 8 possible values for bits [31,29] in a Pseudo Instruction.
+ * Bits [28,25] are always 0000.
+ */
+
+enum DispatchPseudo {
+ PSEUDO_UNALLOC_000, // unallocated
+ PSEUDO_UNALLOC_001, // ditto
+ PSEUDO_UNALLOC_010, // ditto
+ PSEUDO_UNALLOC_011, // ditto
+ PSEUDO_UNALLOC_100, // ditto
+ PSEUDO_UNALLOC_101, // ditto
+ PSEUDO_CALLOUT_110, // CALLOUT -- bits [24,0] identify call/ret sig
+ PSEUDO_HALT_111 // HALT -- bits [24, 0] identify halt code
+};
+
+// bits [25, 23] of a DPImm are the secondary dispatch vector
+
+static inline u_int32_t dispatchDPImm(u_int32_t instr)
+{
+ return pickshift32(instr, 25, 23, 0);
+}
+
+/*
+ * the 8 possible values for bits [25,23] in a Data Processing Immediate
+ * Instruction. Bits [28,25] are always 100_.
+ */
+
+enum DispatchDPImm {
+ DPIMM_PCADR_000, // PC-rel-addressing
+ DPIMM_PCADR_001, // ditto
+ DPIMM_ADDSUB_010, // Add/Subtract (immediate)
+ DPIMM_ADDSUB_011, // ditto
+ DPIMM_LOG_100, // Logical (immediate)
+ DPIMM_MOV_101, // Move Wide (immediate)
+ DPIMM_BITF_110, // Bitfield
+ DPIMM_EXTR_111 // Extract
+};
+
+// bits [29,28:26] of a LS are the secondary dispatch vector
+
+static inline u_int32_t dispatchLS(u_int32_t instr)
+{
+ return (pickshift32(instr, 29, 28, 1) |
+ pickshift32(instr, 26, 26, 0));
+}
+
+/*
+ * the 8 possible values for bits [29,28:26] in a Load/Store
+ * Instruction. Bits [28,25] are always _1_0
+ */
+
+enum DispatchLS {
+ LS_EXCL_000, // Load/store exclusive (includes some unallocated)
+ LS_ADVSIMD_001, // AdvSIMD load/store (various -- includes some unallocated)
+ LS_LIT_010, // Load register literal (includes some unallocated)
+ LS_LIT_011, // ditto
+ LS_PAIR_100, // Load/store register pair (various)
+ LS_PAIR_101, // ditto
+ LS_OTHER_110, // other load/store formats
+ LS_OTHER_111 // ditto
+};
+
+// bits [28:24:21] of a DPReg are the secondary dispatch vector
+
+static inline u_int32_t dispatchDPReg(u_int32_t instr)
+{
+ return (pickshift32(instr, 28, 28, 2) |
+ pickshift32(instr, 24, 24, 1) |
+ pickshift32(instr, 21, 21, 0));
+}
+
+/*
+ * the 8 possible values for bits [28:24:21] in a Data Processing
+ * Register Instruction. Bits [28,25] are always _101
+ */
+
+enum DispatchDPReg {
+ DPREG_LOG_000, // Logical (shifted register)
+ DPREG_LOG_001, // ditto
+ DPREG_ADDSHF_010, // Add/subtract (shifted register)
+ DPREG_ADDEXT_011, // Add/subtract (extended register)
+ DPREG_ADDCOND_100, // Add/subtract (with carry) AND
+ // Cond compare/select AND
+ // Data Processing (1/2 source)
+ DPREG_UNALLOC_101, // Unallocated
+ DPREG_3SRC_110, // Data Processing (3 source)
+ DPREG_3SRC_111 // Data Processing (3 source)
+};
+
+// bits [31,29] of a BrExSys are the secondary dispatch vector
+
+static inline u_int32_t dispatchBrExSys(u_int32_t instr)
+{
+ return pickbits32(instr, 31, 29);
+}
+
+/*
+ * the 8 possible values for bits [31,29] in a Branch/Exception/System
+ * Instruction. Bits [28,25] are always 101_
+ */
+
+enum DispatchBr {
+ BR_IMM_000, // Unconditional branch (immediate)
+ BR_IMMCMP_001, // Compare & branch (immediate) AND
+ // Test & branch (immediate)
+ BR_IMMCOND_010, // Conditional branch (immediate) AND Unallocated
+ BR_UNALLOC_011, // Unallocated
+ BR_IMM_100, // Unconditional branch (immediate)
+ BR_IMMCMP_101, // Compare & branch (immediate) AND
+ // Test & branch (immediate)
+ BR_REG_110, // Unconditional branch (register) AND System AND
+ // Excn gen AND Unallocated
+ BR_UNALLOC_111 // Unallocated
+};
+
+/*
+ * TODO still need to provide secondary decode and dispatch for
+ * AdvSIMD instructions with instr[28,25] = 0111 or 1111
+ */
+
+#endif // ifndef _DECODE_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/depChecker_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "compiler/disassembler.hpp"
+#include "depChecker_aarch64.hpp"
+
+// Nothing to do on aarch64
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/depChecker_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_DEPCHECKER_AARCH64_HPP
+#define CPU_AARCH64_VM_DEPCHECKER_AARCH64_HPP
+
+// Nothing to do on aarch64
+
+#endif // CPU_AARCH64_VM_DEPCHECKER_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/disassembler_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_DISASSEMBLER_AARCH64_HPP
+#define CPU_AARCH64_VM_DISASSEMBLER_AARCH64_HPP
+
+ static int pd_instruction_alignment() {
+ return 1;
+ }
+
+ static const char* pd_cpu_opts() {
+ return "";
+ }
+
+#endif // CPU_AARCH64_VM_DISASSEMBLER_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/frame_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,832 @@
+/*
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "interpreter/interpreter.hpp"
+#include "memory/resourceArea.hpp"
+#include "oops/markOop.hpp"
+#include "oops/method.hpp"
+#include "oops/oop.inline.hpp"
+#include "prims/methodHandles.hpp"
+#include "runtime/frame.inline.hpp"
+#include "runtime/handles.inline.hpp"
+#include "runtime/javaCalls.hpp"
+#include "runtime/monitorChunk.hpp"
+#include "runtime/os.hpp"
+#include "runtime/signature.hpp"
+#include "runtime/stubCodeGenerator.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "vmreg_aarch64.inline.hpp"
+#ifdef COMPILER1
+#include "c1/c1_Runtime1.hpp"
+#include "runtime/vframeArray.hpp"
+#endif
+
+#ifdef ASSERT
+void RegisterMap::check_location_valid() {
+}
+#endif
+
+
+// Profiling/safepoint support
+
+bool frame::safe_for_sender(JavaThread *thread) {
+ address sp = (address)_sp;
+ address fp = (address)_fp;
+ address unextended_sp = (address)_unextended_sp;
+
+ // consider stack guards when trying to determine "safe" stack pointers
+ static size_t stack_guard_size = os::uses_stack_guard_pages() ? (StackYellowPages + StackRedPages) * os::vm_page_size() : 0;
+ size_t usable_stack_size = thread->stack_size() - stack_guard_size;
+
+ // sp must be within the usable part of the stack (not in guards)
+ bool sp_safe = (sp < thread->stack_base()) &&
+ (sp >= thread->stack_base() - usable_stack_size);
+
+
+ if (!sp_safe) {
+ return false;
+ }
+
+ // unextended sp must be within the stack and above or equal sp
+ bool unextended_sp_safe = (unextended_sp < thread->stack_base()) &&
+ (unextended_sp >= sp);
+
+ if (!unextended_sp_safe) {
+ return false;
+ }
+
+ // an fp must be within the stack and above (but not equal to) sp
+ // second evaluation on fp+ is added to handle the situation where fp is -1
+ bool fp_safe = (fp < thread->stack_base() && (fp > sp) && (((fp + (return_addr_offset * sizeof(void*))) < thread->stack_base())));
+
+ // We know sp/unextended_sp are safe only fp is questionable here
+
+ // If the current frame is known to the code cache then we can attempt
+ // to construct the sender and do some validation of it. This goes a long way
+ // toward eliminating issues when we get into frame construction code
+
+ if (_cb != NULL ) {
+
+ // First check if frame is complete and tester is reliable
+ // Unfortunately we can only check frame completeness for runtime stubs
+ // and nmethods; other generic buffer blobs are more problematic so we
+ // just assume they are ok. Adapter blobs never have a frame complete
+ // and are never ok.
+
+ if (!_cb->is_frame_complete_at(_pc)) {
+ if (_cb->is_nmethod() || _cb->is_adapter_blob() || _cb->is_runtime_stub()) {
+ return false;
+ }
+ }
+
+ // Could just be some random pointer within the codeBlob
+ if (!_cb->code_contains(_pc)) {
+ return false;
+ }
+
+ // Entry frame checks
+ if (is_entry_frame()) {
+ // an entry frame must have a valid fp.
+
+ if (!fp_safe) return false;
+
+ // Validate the JavaCallWrapper an entry frame must have
+
+ address jcw = (address)entry_frame_call_wrapper();
+
+ bool jcw_safe = (jcw < thread->stack_base()) && ( jcw > fp);
+
+ return jcw_safe;
+
+ }
+
+ intptr_t* sender_sp = NULL;
+ address sender_pc = NULL;
+
+ if (is_interpreted_frame()) {
+ // fp must be safe
+ if (!fp_safe) {
+ return false;
+ }
+
+ sender_pc = (address) this->fp()[return_addr_offset];
+ sender_sp = (intptr_t*) addr_at(sender_sp_offset);
+
+ } else {
+ // must be some sort of compiled/runtime frame
+ // fp does not have to be safe (although it could be checked for c1?)
+
+ // check for a valid frame_size, otherwise we are unlikely to get a valid sender_pc
+ if (_cb->frame_size() <= 0) {
+ return false;
+ }
+
+ sender_sp = _unextended_sp + _cb->frame_size();
+ sender_pc = (address) *(sender_sp-1);
+ }
+
+
+ // If the potential sender is the interpreter then we can do some more checking
+ if (Interpreter::contains(sender_pc)) {
+
+ // fp is always saved in a recognizable place in any code we generate. However
+ // only if the sender is interpreted/call_stub (c1 too?) are we certain that the saved fp
+ // is really a frame pointer.
+
+ intptr_t *saved_fp = (intptr_t*)*(sender_sp - frame::sender_sp_offset);
+ bool saved_fp_safe = ((address)saved_fp < thread->stack_base()) && (saved_fp > sender_sp);
+
+ if (!saved_fp_safe) {
+ return false;
+ }
+
+ // construct the potential sender
+
+ frame sender(sender_sp, saved_fp, sender_pc);
+
+ return sender.is_interpreted_frame_valid(thread);
+
+ }
+
+ // We must always be able to find a recognizable pc
+ CodeBlob* sender_blob = CodeCache::find_blob_unsafe(sender_pc);
+ if (sender_pc == NULL || sender_blob == NULL) {
+ return false;
+ }
+
+ // Could be a zombie method
+ if (sender_blob->is_zombie() || sender_blob->is_unloaded()) {
+ return false;
+ }
+
+ // Could just be some random pointer within the codeBlob
+ if (!sender_blob->code_contains(sender_pc)) {
+ return false;
+ }
+
+ // We should never be able to see an adapter if the current frame is something from code cache
+ if (sender_blob->is_adapter_blob()) {
+ return false;
+ }
+
+ // Could be the call_stub
+ if (StubRoutines::returns_to_call_stub(sender_pc)) {
+ intptr_t *saved_fp = (intptr_t*)*(sender_sp - frame::sender_sp_offset);
+ bool saved_fp_safe = ((address)saved_fp < thread->stack_base()) && (saved_fp > sender_sp);
+
+ if (!saved_fp_safe) {
+ return false;
+ }
+
+ // construct the potential sender
+
+ frame sender(sender_sp, saved_fp, sender_pc);
+
+ // Validate the JavaCallWrapper an entry frame must have
+ address jcw = (address)sender.entry_frame_call_wrapper();
+
+ bool jcw_safe = (jcw < thread->stack_base()) && ( jcw > (address)sender.fp());
+
+ return jcw_safe;
+ }
+
+ if (sender_blob->is_nmethod()) {
+ nmethod* nm = sender_blob->as_nmethod_or_null();
+ if (nm != NULL) {
+ if (nm->is_deopt_mh_entry(sender_pc) || nm->is_deopt_entry(sender_pc)) {
+ return false;
+ }
+ }
+ }
+
+ // A frame size of 0 (or less) is bad because every nmethod has a non-zero
+ // frame size: the return address counts against the callee's frame.
+
+ if (sender_blob->frame_size() <= 0) {
+ assert(!sender_blob->is_nmethod(), "should count return address at least");
+ return false;
+ }
+
+ // We should never be able to see anything here except an nmethod. If something in the
+ // code cache (current frame) is called by an entity within the code cache, that entity
+ // should not be anything but the call stub (already covered), the interpreter (already covered)
+ // or an nmethod.
+
+ if (!sender_blob->is_nmethod()) {
+ return false;
+ }
+
+ // Could put some more validation for the potential non-interpreted sender
+ // frame we'd create by calling sender if I could think of any. Wait for next crash in forte...
+
+ // One idea is seeing if the sender_pc we have is one that we'd expect to call to current cb
+
+ // We've validated the potential sender that would be created
+ return true;
+ }
+
+ // Must be native-compiled frame. Since sender will try and use fp to find
+ // linkages it must be safe
+
+ if (!fp_safe) {
+ return false;
+ }
+
+ // Will the pc we fetch be non-zero (which we'll find at the oldest frame)
+
+ if ( (address) this->fp()[return_addr_offset] == NULL) return false;
+
+
+ // could try and do some more potential verification of native frame if we could think of some...
+
+ return true;
+
+}
+
+void frame::patch_pc(Thread* thread, address pc) {
+ address* pc_addr = &(((address*) sp())[-1]);
+ if (TracePcPatching) {
+ tty->print_cr("patch_pc at address " INTPTR_FORMAT " [" INTPTR_FORMAT " -> " INTPTR_FORMAT "]",
+ p2i(pc_addr), p2i(*pc_addr), p2i(pc));
+ }
+ // Either the return address is the original one or we are going to
+ // patch in the same address that's already there.
+ assert(_pc == *pc_addr || pc == *pc_addr, "must be");
+ *pc_addr = pc;
+ _cb = CodeCache::find_blob(pc);
+ address original_pc = nmethod::get_deopt_original_pc(this);
+ if (original_pc != NULL) {
+ assert(original_pc == _pc, "expected original PC to be stored before patching");
+ _deopt_state = is_deoptimized;
+ // leave _pc as is
+ } else {
+ _deopt_state = not_deoptimized;
+ _pc = pc;
+ }
+}
+
+bool frame::is_interpreted_frame() const {
+ return Interpreter::contains(pc());
+}
+
+int frame::frame_size(RegisterMap* map) const {
+ frame sender = this->sender(map);
+ return sender.sp() - sp();
+}
+
+intptr_t* frame::entry_frame_argument_at(int offset) const {
+ // convert offset to index to deal with tsi
+ int index = (Interpreter::expr_offset_in_bytes(offset)/wordSize);
+ // Entry frame's arguments are always in relation to unextended_sp()
+ return &unextended_sp()[index];
+}
+
+// sender_sp
+#ifdef CC_INTERP
+intptr_t* frame::interpreter_frame_sender_sp() const {
+ assert(is_interpreted_frame(), "interpreted frame expected");
+ // QQQ why does this specialized method exist if frame::sender_sp() does the same thing?
+ // seems odd and if we always know interpreted vs. non then sender_sp() is really
+ // doing too much work.
+ return get_interpreterState()->sender_sp();
+}
+
+// monitor elements
+
+BasicObjectLock* frame::interpreter_frame_monitor_begin() const {
+ return get_interpreterState()->monitor_base();
+}
+
+BasicObjectLock* frame::interpreter_frame_monitor_end() const {
+ return (BasicObjectLock*) get_interpreterState()->stack_base();
+}
+
+#else // CC_INTERP
+
+intptr_t* frame::interpreter_frame_sender_sp() const {
+ assert(is_interpreted_frame(), "interpreted frame expected");
+ return (intptr_t*) at(interpreter_frame_sender_sp_offset);
+}
+
+void frame::set_interpreter_frame_sender_sp(intptr_t* sender_sp) {
+ assert(is_interpreted_frame(), "interpreted frame expected");
+ ptr_at_put(interpreter_frame_sender_sp_offset, (intptr_t) sender_sp);
+}
+
+
+// monitor elements
+
+BasicObjectLock* frame::interpreter_frame_monitor_begin() const {
+ return (BasicObjectLock*) addr_at(interpreter_frame_monitor_block_bottom_offset);
+}
+
+BasicObjectLock* frame::interpreter_frame_monitor_end() const {
+ BasicObjectLock* result = (BasicObjectLock*) *addr_at(interpreter_frame_monitor_block_top_offset);
+ // make sure the pointer points inside the frame
+ assert(sp() <= (intptr_t*) result, "monitor end should be above the stack pointer");
+ assert((intptr_t*) result < fp(), "monitor end should be strictly below the frame pointer");
+ return result;
+}
+
+void frame::interpreter_frame_set_monitor_end(BasicObjectLock* value) {
+ *((BasicObjectLock**)addr_at(interpreter_frame_monitor_block_top_offset)) = value;
+}
+
+// Used by template based interpreter deoptimization
+void frame::interpreter_frame_set_last_sp(intptr_t* sp) {
+ *((intptr_t**)addr_at(interpreter_frame_last_sp_offset)) = sp;
+}
+#endif // CC_INTERP
+
+frame frame::sender_for_entry_frame(RegisterMap* map) const {
+ assert(map != NULL, "map must be set");
+ // Java frame called from C; skip all C frames and return top C
+ // frame of that chunk as the sender
+ JavaFrameAnchor* jfa = entry_frame_call_wrapper()->anchor();
+ assert(!entry_frame_is_first(), "next Java fp must be non zero");
+ assert(jfa->last_Java_sp() > sp(), "must be above this frame on stack");
+ map->clear();
+ assert(map->include_argument_oops(), "should be set by clear");
+ if (jfa->last_Java_pc() != NULL ) {
+ frame fr(jfa->last_Java_sp(), jfa->last_Java_fp(), jfa->last_Java_pc());
+ return fr;
+ }
+ frame fr(jfa->last_Java_sp(), jfa->last_Java_fp());
+ return fr;
+}
+
+//------------------------------------------------------------------------------
+// frame::verify_deopt_original_pc
+//
+// Verifies the calculated original PC of a deoptimization PC for the
+// given unextended SP. The unextended SP might also be the saved SP
+// for MethodHandle call sites.
+#ifdef ASSERT
+void frame::verify_deopt_original_pc(nmethod* nm, intptr_t* unextended_sp, bool is_method_handle_return) {
+ frame fr;
+
+ // This is ugly but it's better than changing {get,set}_original_pc
+ // to take an SP value as argument. And it's only a debugging
+ // method anyway.
+ fr._unextended_sp = unextended_sp;
+
+ address original_pc = nm->get_original_pc(&fr);
+ assert(nm->insts_contains(original_pc), "original PC must be in nmethod");
+ assert(nm->is_method_handle_return(original_pc) == is_method_handle_return, "must be");
+}
+#endif
+
+//------------------------------------------------------------------------------
+// frame::adjust_unextended_sp
+void frame::adjust_unextended_sp() {
+ // If we are returning to a compiled MethodHandle call site, the
+ // saved_fp will in fact be a saved value of the unextended SP. The
+ // simplest way to tell whether we are returning to such a call site
+ // is as follows:
+
+ nmethod* sender_nm = (_cb == NULL) ? NULL : _cb->as_nmethod_or_null();
+ if (sender_nm != NULL) {
+ // If the sender PC is a deoptimization point, get the original
+ // PC. For MethodHandle call site the unextended_sp is stored in
+ // saved_fp.
+ if (sender_nm->is_deopt_mh_entry(_pc)) {
+ DEBUG_ONLY(verify_deopt_mh_original_pc(sender_nm, _fp));
+ _unextended_sp = _fp;
+ }
+ else if (sender_nm->is_deopt_entry(_pc)) {
+ DEBUG_ONLY(verify_deopt_original_pc(sender_nm, _unextended_sp));
+ }
+ else if (sender_nm->is_method_handle_return(_pc)) {
+ _unextended_sp = _fp;
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// frame::update_map_with_saved_link
+void frame::update_map_with_saved_link(RegisterMap* map, intptr_t** link_addr) {
+ // The interpreter and compiler(s) always save fp in a known
+ // location on entry. We must record where that location is
+ // so that if fp was live on callout from c2 we can find
+ // the saved copy no matter what it called.
+
+ // Since the interpreter always saves fp if we record where it is then
+ // we don't have to always save fp on entry and exit to c2 compiled
+ // code, on entry will be enough.
+ map->set_location(rfp->as_VMReg(), (address) link_addr);
+ // this is weird: "H" ought to be at a higher address, however the
+ // oopMaps seem to have the "H" regs at the same address as the
+ // vanilla register.
+ // XXXX make this go away
+ if (true) {
+ map->set_location(rfp->as_VMReg()->next(), (address) link_addr);
+ }
+}
+
+
+//------------------------------------------------------------------------------
+// frame::sender_for_interpreter_frame
+frame frame::sender_for_interpreter_frame(RegisterMap* map) const {
+ // SP is the raw SP from the sender after adapter or interpreter
+ // extension.
+ intptr_t* sender_sp = this->sender_sp();
+
+ // This is the sp before any possible extension (adapter/locals).
+ intptr_t* unextended_sp = interpreter_frame_sender_sp();
+
+#ifdef COMPILER2
+ if (map->update_map()) {
+ update_map_with_saved_link(map, (intptr_t**) addr_at(link_offset));
+ }
+#endif // COMPILER2
+
+ return frame(sender_sp, unextended_sp, link(), sender_pc());
+}
+
+
+//------------------------------------------------------------------------------
+// frame::sender_for_compiled_frame
+frame frame::sender_for_compiled_frame(RegisterMap* map) const {
+ // we cannot rely upon the last fp having been saved to the thread
+ // in C2 code but it will have been pushed onto the stack. so we
+ // have to find it relative to the unextended sp
+
+ assert(_cb->frame_size() >= 0, "must have non-zero frame size");
+ intptr_t* l_sender_sp = unextended_sp() + _cb->frame_size();
+ intptr_t* unextended_sp = l_sender_sp;
+
+ // the return_address is always the word on the stack
+ address sender_pc = (address) *(l_sender_sp-1);
+
+ intptr_t** saved_fp_addr = (intptr_t**) (l_sender_sp - frame::sender_sp_offset);
+
+ // assert (sender_sp() == l_sender_sp, "should be");
+ // assert (*saved_fp_addr == link(), "should be");
+
+ if (map->update_map()) {
+ // Tell GC to use argument oopmaps for some runtime stubs that need it.
+ // For C1, the runtime stub might not have oop maps, so set this flag
+ // outside of update_register_map.
+ map->set_include_argument_oops(_cb->caller_must_gc_arguments(map->thread()));
+ if (_cb->oop_maps() != NULL) {
+ OopMapSet::update_register_map(this, map);
+ }
+
+ // Since the prolog does the save and restore of FP there is no
+ // oopmap for it so we must fill in its location as if there was
+ // an oopmap entry since if our caller was compiled code there
+ // could be live jvm state in it.
+ update_map_with_saved_link(map, saved_fp_addr);
+ }
+
+ return frame(l_sender_sp, unextended_sp, *saved_fp_addr, sender_pc);
+}
+
+//------------------------------------------------------------------------------
+// frame::sender
+frame frame::sender(RegisterMap* map) const {
+ // Default is we don't have to follow them. The sender_for_xxx will
+ // update it accordingly
+ map->set_include_argument_oops(false);
+
+ if (is_entry_frame())
+ return sender_for_entry_frame(map);
+ if (is_interpreted_frame())
+ return sender_for_interpreter_frame(map);
+ assert(_cb == CodeCache::find_blob(pc()),"Must be the same");
+
+ // This test looks odd: why is it not is_compiled_frame() ? That's
+ // because stubs also have OOP maps.
+ if (_cb != NULL) {
+ return sender_for_compiled_frame(map);
+ }
+
+ // Must be native-compiled frame, i.e. the marshaling code for native
+ // methods that exists in the core system.
+ return frame(sender_sp(), link(), sender_pc());
+}
+
+bool frame::interpreter_frame_equals_unpacked_fp(intptr_t* fp) {
+ assert(is_interpreted_frame(), "must be interpreter frame");
+ Method* method = interpreter_frame_method();
+ // When unpacking an optimized frame the frame pointer is
+ // adjusted with:
+ int diff = (method->max_locals() - method->size_of_parameters()) *
+ Interpreter::stackElementWords;
+ return _fp == (fp - diff);
+}
+
+bool frame::is_interpreted_frame_valid(JavaThread* thread) const {
+// QQQ
+#ifdef CC_INTERP
+#else
+ assert(is_interpreted_frame(), "Not an interpreted frame");
+ // These are reasonable sanity checks
+ if (fp() == 0 || (intptr_t(fp()) & (wordSize-1)) != 0) {
+ return false;
+ }
+ if (sp() == 0 || (intptr_t(sp()) & (wordSize-1)) != 0) {
+ return false;
+ }
+ if (fp() + interpreter_frame_initial_sp_offset < sp()) {
+ return false;
+ }
+ // These are hacks to keep us out of trouble.
+ // The problem with these is that they mask other problems
+ if (fp() <= sp()) { // this attempts to deal with unsigned comparison above
+ return false;
+ }
+
+ // do some validation of frame elements
+
+ // first the method
+
+ Method* m = *interpreter_frame_method_addr();
+
+ // validate the method we'd find in this potential sender
+ if (!m->is_valid_method()) return false;
+
+ // stack frames shouldn't be much larger than max_stack elements
+
+ if (fp() - sp() > 1024 + m->max_stack()*Interpreter::stackElementSize) {
+ return false;
+ }
+
+ // validate bci/bcx
+
+ address bcp = interpreter_frame_bcp();
+ if (m->validate_bci_from_bcp(bcp) < 0) {
+ return false;
+ }
+
+ // validate constantPoolCache*
+ ConstantPoolCache* cp = *interpreter_frame_cache_addr();
+ if (cp == NULL || !cp->is_metaspace_object()) return false;
+
+ // validate locals
+
+ address locals = (address) *interpreter_frame_locals_addr();
+
+ if (locals > thread->stack_base() || locals < (address) fp()) return false;
+
+ // We'd have to be pretty unlucky to be misled at this point
+
+#endif // CC_INTERP
+ return true;
+}
+
+BasicType frame::interpreter_frame_result(oop* oop_result, jvalue* value_result) {
+#ifdef CC_INTERP
+ // Needed for JVMTI. The result should always be in the
+ // interpreterState object
+ interpreterState istate = get_interpreterState();
+#endif // CC_INTERP
+ assert(is_interpreted_frame(), "interpreted frame expected");
+ Method* method = interpreter_frame_method();
+ BasicType type = method->result_type();
+
+ intptr_t* tos_addr;
+ if (method->is_native()) {
+ // TODO : ensure AARCH64 does the same as Intel here i.e. push v0 then r0
+ // Prior to calling into the runtime to report the method_exit the possible
+ // return value is pushed to the native stack. If the result is a jfloat/jdouble
+ // then ST0 is saved before EAX/EDX. See the note in generate_native_result
+ tos_addr = (intptr_t*)sp();
+ if (type == T_FLOAT || type == T_DOUBLE) {
+ // This is times two because we do a push(ltos) after pushing XMM0
+ // and that takes two interpreter stack slots.
+ tos_addr += 2 * Interpreter::stackElementWords;
+ }
+ } else {
+ tos_addr = (intptr_t*)interpreter_frame_tos_address();
+ }
+
+ switch (type) {
+ case T_OBJECT :
+ case T_ARRAY : {
+ oop obj;
+ if (method->is_native()) {
+#ifdef CC_INTERP
+ obj = istate->_oop_temp;
+#else
+ obj = cast_to_oop(at(interpreter_frame_oop_temp_offset));
+#endif // CC_INTERP
+ } else {
+ oop* obj_p = (oop*)tos_addr;
+ obj = (obj_p == NULL) ? (oop)NULL : *obj_p;
+ }
+ assert(obj == NULL || Universe::heap()->is_in(obj), "sanity check");
+ *oop_result = obj;
+ break;
+ }
+ case T_BOOLEAN : value_result->z = *(jboolean*)tos_addr; break;
+ case T_BYTE : value_result->b = *(jbyte*)tos_addr; break;
+ case T_CHAR : value_result->c = *(jchar*)tos_addr; break;
+ case T_SHORT : value_result->s = *(jshort*)tos_addr; break;
+ case T_INT : value_result->i = *(jint*)tos_addr; break;
+ case T_LONG : value_result->j = *(jlong*)tos_addr; break;
+ case T_FLOAT : {
+ value_result->f = *(jfloat*)tos_addr;
+ break;
+ }
+ case T_DOUBLE : value_result->d = *(jdouble*)tos_addr; break;
+ case T_VOID : /* Nothing to do */ break;
+ default : ShouldNotReachHere();
+ }
+
+ return type;
+}
+
+
+intptr_t* frame::interpreter_frame_tos_at(jint offset) const {
+ int index = (Interpreter::expr_offset_in_bytes(offset)/wordSize);
+ return &interpreter_frame_tos_address()[index];
+}
+
+#ifndef PRODUCT
+
+#define DESCRIBE_FP_OFFSET(name) \
+ values.describe(frame_no, fp() + frame::name##_offset, #name)
+
+void frame::describe_pd(FrameValues& values, int frame_no) {
+ if (is_interpreted_frame()) {
+ DESCRIBE_FP_OFFSET(interpreter_frame_sender_sp);
+ DESCRIBE_FP_OFFSET(interpreter_frame_last_sp);
+ DESCRIBE_FP_OFFSET(interpreter_frame_method);
+ DESCRIBE_FP_OFFSET(interpreter_frame_mdp);
+ DESCRIBE_FP_OFFSET(interpreter_frame_cache);
+ DESCRIBE_FP_OFFSET(interpreter_frame_locals);
+ DESCRIBE_FP_OFFSET(interpreter_frame_bcp);
+ DESCRIBE_FP_OFFSET(interpreter_frame_initial_sp);
+ }
+}
+#endif
+
+intptr_t *frame::initial_deoptimization_info() {
+ // Not used on aarch64, but we must return something.
+ return NULL;
+}
+
+intptr_t* frame::real_fp() const {
+ if (_cb != NULL) {
+ // use the frame size if valid
+ int size = _cb->frame_size();
+ if (size > 0) {
+ return unextended_sp() + size;
+ }
+ }
+ // else rely on fp()
+ assert(! is_compiled_frame(), "unknown compiled frame size");
+ return fp();
+}
+
+#undef DESCRIBE_FP_OFFSET
+
+#define DESCRIBE_FP_OFFSET(name) \
+ { \
+ unsigned long *p = (unsigned long *)fp; \
+ printf("0x%016lx 0x%016lx %s\n", (unsigned long)(p + frame::name##_offset), \
+ p[frame::name##_offset], #name); \
+ }
+
+static __thread unsigned long nextfp;
+static __thread unsigned long nextpc;
+static __thread unsigned long nextsp;
+static __thread RegisterMap *reg_map;
+
+static void printbc(Method *m, intptr_t bcx) {
+ const char *name;
+ char buf[16];
+ if (m->validate_bci_from_bcp((address)bcx) < 0
+ || !m->contains((address)bcx)) {
+ name = "???";
+ snprintf(buf, sizeof buf, "(bad)");
+ } else {
+ int bci = m->bci_from((address)bcx);
+ snprintf(buf, sizeof buf, "%d", bci);
+ name = Bytecodes::name(m->code_at(bci));
+ }
+ ResourceMark rm;
+ printf("%s : %s ==> %s\n", m->name_and_sig_as_C_string(), buf, name);
+}
+
+void internal_pf(unsigned long sp, unsigned long fp, unsigned long pc, unsigned long bcx) {
+ if (! fp)
+ return;
+
+ DESCRIBE_FP_OFFSET(return_addr);
+ DESCRIBE_FP_OFFSET(link);
+ DESCRIBE_FP_OFFSET(interpreter_frame_sender_sp);
+ DESCRIBE_FP_OFFSET(interpreter_frame_last_sp);
+ DESCRIBE_FP_OFFSET(interpreter_frame_method);
+ DESCRIBE_FP_OFFSET(interpreter_frame_mdp);
+ DESCRIBE_FP_OFFSET(interpreter_frame_cache);
+ DESCRIBE_FP_OFFSET(interpreter_frame_locals);
+ DESCRIBE_FP_OFFSET(interpreter_frame_bcp);
+ DESCRIBE_FP_OFFSET(interpreter_frame_initial_sp);
+ unsigned long *p = (unsigned long *)fp;
+
+ // We want to see all frames, native and Java. For compiled and
+ // interpreted frames we have special information that allows us to
+ // unwind them; for everything else we assume that the native frame
+ // pointer chain is intact.
+ frame this_frame((intptr_t*)sp, (intptr_t*)fp, (address)pc);
+ if (this_frame.is_compiled_frame() ||
+ this_frame.is_interpreted_frame()) {
+ frame sender = this_frame.sender(reg_map);
+ nextfp = (unsigned long)sender.fp();
+ nextpc = (unsigned long)sender.pc();
+ nextsp = (unsigned long)sender.unextended_sp();
+ } else {
+ nextfp = p[frame::link_offset];
+ nextpc = p[frame::return_addr_offset];
+ nextsp = (unsigned long)&p[frame::sender_sp_offset];
+ }
+
+ if (bcx == -1ul)
+ bcx = p[frame::interpreter_frame_bcp_offset];
+
+ if (Interpreter::contains((address)pc)) {
+ Method* m = (Method*)p[frame::interpreter_frame_method_offset];
+ if(m && m->is_method()) {
+ printbc(m, bcx);
+ } else
+ printf("not a Method\n");
+ } else {
+ CodeBlob *cb = CodeCache::find_blob((address)pc);
+ if (cb != NULL) {
+ if (cb->is_nmethod()) {
+ ResourceMark rm;
+ nmethod* nm = (nmethod*)cb;
+ printf("nmethod %s\n", nm->method()->name_and_sig_as_C_string());
+ } else if (cb->name()) {
+ printf("CodeBlob %s\n", cb->name());
+ }
+ }
+ }
+}
+
+extern "C" void npf() {
+ CodeBlob *cb = CodeCache::find_blob((address)nextpc);
+ // C2 does not always chain the frame pointers when it can, instead
+ // preferring to use fixed offsets from SP, so a simple leave() does
+ // not work. Instead, it adds the frame size to SP then pops FP and
+ // LR. We have to do the same thing to get a good call chain.
+ if (cb && cb->frame_size())
+ nextfp = nextsp + wordSize * (cb->frame_size() - 2);
+ internal_pf (nextsp, nextfp, nextpc, -1);
+}
+
+extern "C" void pf(unsigned long sp, unsigned long fp, unsigned long pc,
+ unsigned long bcx, unsigned long thread) {
+ RegisterMap map((JavaThread*)thread, false);
+ if (!reg_map) {
+ reg_map = (RegisterMap*)os::malloc(sizeof map, mtNone);
+ }
+ memcpy(reg_map, &map, sizeof map);
+ {
+ CodeBlob *cb = CodeCache::find_blob((address)pc);
+ if (cb && cb->frame_size())
+ fp = sp + wordSize * (cb->frame_size() - 2);
+ }
+ internal_pf(sp, fp, pc, bcx);
+}
+
+// support for printing out where we are in a Java method
+// needs to be passed current fp and bcp register values
+// prints method name, bc index and bytecode name
+extern "C" void pm(unsigned long fp, unsigned long bcx) {
+ DESCRIBE_FP_OFFSET(interpreter_frame_method);
+ unsigned long *p = (unsigned long *)fp;
+ Method* m = (Method*)p[frame::interpreter_frame_method_offset];
+ printbc(m, bcx);
+}
+
+#ifndef PRODUCT
+// This is a generic constructor which is only used by pns() in debug.cpp.
+frame::frame(void* sp, void* fp, void* pc) {
+ init((intptr_t*)sp, (intptr_t*)fp, (address)pc);
+}
+#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/frame_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_FRAME_AARCH64_HPP
+#define CPU_AARCH64_VM_FRAME_AARCH64_HPP
+
+#include "runtime/synchronizer.hpp"
+#include "utilities/top.hpp"
+
+// A frame represents a physical stack frame (an activation). Frames can be
+// C or Java frames, and the Java frames can be interpreted or compiled.
+// In contrast, vframes represent source-level activations, so that one physical frame
+// can correspond to multiple source level frames because of inlining.
+// A frame is comprised of {pc, fp, sp}
+// ------------------------------ Asm interpreter ----------------------------------------
+// Layout of asm interpreter frame:
+// [expression stack ] * <- sp
+
+// [monitors[0] ] \
+// ... | monitor block size = k
+// [monitors[k-1] ] /
+// [frame initial esp ] ( == &monitors[0], initially here) initial_sp_offset
+// [byte code index/pointer] = bcx()                    bcx_offset
+
+// [pointer to locals ] = locals() locals_offset
+// [constant pool cache ] = cache() cache_offset
+
+// [methodData ] = mdp() mdx_offset
+// [methodOop ] = method() method_offset
+
+// [last esp ] = last_sp() last_sp_offset
+// [old stack pointer ] (sender_sp) sender_sp_offset
+
+// [old frame pointer ] <- fp = link()
+// [return pc ]
+
+// [last sp ]
+// [oop temp ] (only for native calls)
+
+// [locals and parameters ]
+// <- sender sp
+// ------------------------------ Asm interpreter ----------------------------------------
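+//
+// As an illustrative sketch only (using the offsets defined below), the
+// interpreter frame fields can be read as word offsets from fp, e.g.
+//
+//   Method* m   = *(Method**)(fp + interpreter_frame_method_offset);
+//   address bcp = *(address*)(fp + interpreter_frame_bcp_offset);
+//
+// where fp is the frame's intptr_t* frame pointer.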
+
+// ------------------------------ C++ interpreter ----------------------------------------
+//
+// Layout of C++ interpreter frame: (While executing in BytecodeInterpreter::run)
+//
+// <- SP (current esp/rsp)
+// [local variables ] BytecodeInterpreter::run local variables
+// ... BytecodeInterpreter::run local variables
+// [local variables ] BytecodeInterpreter::run local variables
+// [old frame pointer ] fp [ BytecodeInterpreter::run's ebp/rbp ]
+// [return pc ] (return to frame manager)
+// [interpreter_state* ] (arg to BytecodeInterpreter::run) --------------
+// [expression stack ] <- last_Java_sp |
+// [... ] * <- interpreter_state.stack |
+// [expression stack ] * <- interpreter_state.stack_base |
+// [monitors ] \ |
+// ... | monitor block size |
+// [monitors ] / <- interpreter_state.monitor_base |
+// [struct interpretState ] <-----------------------------------------|
+// [return pc              ] (return to callee of frame manager) [1]
+// [locals and parameters ]
+// <- sender sp
+
+// [1] When the c++ interpreter calls a new method it returns to the frame
+// manager which allocates a new frame on the stack. In that case there
+// is no real callee of this newly allocated frame. The frame manager is
+// aware of the additional frame(s) and will pop them as nested calls
+// complete. However, to make it look good in the debugger the frame
+// manager actually installs a dummy pc pointing to RecursiveInterpreterActivation
+// with a fake interpreter_state* parameter to make it easy to debug
+// nested calls.
+
+// Note that contrary to the layout for the assembly interpreter the
+// expression stack allocated for the C++ interpreter is full sized.
+// However this is not as bad as it seems as the interpreter frame_manager
+// will truncate the unused space on successive method calls.
+//
+// ------------------------------ C++ interpreter ----------------------------------------
+
+ public:
+ enum {
+ pc_return_offset = 0,
+ // All frames
+ link_offset = 0,
+ return_addr_offset = 1,
+ sender_sp_offset = 2,
+
+#ifndef CC_INTERP
+
+ // Interpreter frames
+ interpreter_frame_oop_temp_offset = 3, // for native calls only
+
+ interpreter_frame_sender_sp_offset = -1,
+ // outgoing sp before a call to an invoked method
+ interpreter_frame_last_sp_offset = interpreter_frame_sender_sp_offset - 1,
+ interpreter_frame_method_offset = interpreter_frame_last_sp_offset - 1,
+ interpreter_frame_mdp_offset = interpreter_frame_method_offset - 1,
+ interpreter_frame_cache_offset = interpreter_frame_mdp_offset - 1,
+ interpreter_frame_locals_offset = interpreter_frame_cache_offset - 1,
+ interpreter_frame_bcp_offset = interpreter_frame_locals_offset - 1,
+ interpreter_frame_initial_sp_offset = interpreter_frame_bcp_offset - 1,
+
+ interpreter_frame_monitor_block_top_offset = interpreter_frame_initial_sp_offset,
+ interpreter_frame_monitor_block_bottom_offset = interpreter_frame_initial_sp_offset,
+
+#endif // CC_INTERP
+
+ // Entry frames
+ // n.b. these values are determined by the layout defined in
+ // stubGenerator for the Java call stub
+ entry_frame_after_call_words = 27,
+ entry_frame_call_wrapper_offset = -8,
+
+ // we don't need a save area
+ arg_reg_save_area_bytes = 0,
+
+ // TODO - check that this is still correct
+ // Native frames
+
+ native_frame_initial_param_offset = 2
+
+ };
+
+ intptr_t ptr_at(int offset) const {
+ return *ptr_at_addr(offset);
+ }
+
+ void ptr_at_put(int offset, intptr_t value) {
+ *ptr_at_addr(offset) = value;
+ }
+
+ private:
+ // an additional field beyond _sp and _pc:
+ intptr_t* _fp; // frame pointer
+ // The interpreter and adapters will extend the frame of the caller.
+ // Since oopMaps are based on the sp of the caller before extension
+ // we need to know that value. However in order to compute the address
+ // of the return address we need the real "raw" sp. Since sparc already
+ // uses sp() to mean "raw" sp and unextended_sp() to mean the caller's
+ // original sp we use that convention.
+
+ intptr_t* _unextended_sp;
+ void adjust_unextended_sp();
+
+ intptr_t* ptr_at_addr(int offset) const {
+ return (intptr_t*) addr_at(offset);
+ }
+
+#ifdef ASSERT
+ // Used in frame::sender_for_{interpreter,compiled}_frame
+ static void verify_deopt_original_pc( nmethod* nm, intptr_t* unextended_sp, bool is_method_handle_return = false);
+ static void verify_deopt_mh_original_pc(nmethod* nm, intptr_t* unextended_sp) {
+ verify_deopt_original_pc(nm, unextended_sp, true);
+ }
+#endif
+
+ public:
+ // Constructors
+
+ frame(intptr_t* sp, intptr_t* fp, address pc);
+
+ frame(intptr_t* sp, intptr_t* unextended_sp, intptr_t* fp, address pc);
+
+ frame(intptr_t* sp, intptr_t* fp);
+
+ void init(intptr_t* sp, intptr_t* fp, address pc);
+
+ // accessors for the instance variables
+ // Note: not necessarily the real 'frame pointer' (see real_fp)
+ intptr_t* fp() const { return _fp; }
+
+ inline address* sender_pc_addr() const;
+
+ // return address of param, zero origin index.
+ inline address* native_param_addr(int idx) const;
+
+ // expression stack tos if we are nested in a java call
+ intptr_t* interpreter_frame_last_sp() const;
+
+ // helper to update a map with callee-saved RBP
+ static void update_map_with_saved_link(RegisterMap* map, intptr_t** link_addr);
+
+#ifndef CC_INTERP
+ // deoptimization support
+ void interpreter_frame_set_last_sp(intptr_t* sp);
+#endif // CC_INTERP
+
+#ifdef CC_INTERP
+ inline interpreterState get_interpreterState() const;
+#endif // CC_INTERP
+
+#endif // CPU_AARCH64_VM_FRAME_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/frame_aarch64.inline.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,348 @@
+/*
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_FRAME_AARCH64_INLINE_HPP
+#define CPU_AARCH64_VM_FRAME_AARCH64_INLINE_HPP
+
+#include "code/codeCache.hpp"
+#include "code/vmreg.inline.hpp"
+
+// Inline functions for AArch64 frames:
+
+// Constructors:
+
+inline frame::frame() {
+ _pc = NULL;
+ _sp = NULL;
+ _unextended_sp = NULL;
+ _fp = NULL;
+ _cb = NULL;
+ _deopt_state = unknown;
+}
+
+static int spin;
+
+inline void frame::init(intptr_t* sp, intptr_t* fp, address pc) {
+ intptr_t a = intptr_t(sp);
+ intptr_t b = intptr_t(fp);
+#ifndef PRODUCT
+ if (fp)
+ if (sp > fp || (fp - sp > 0x100000))
+ for(;;)
+ asm("nop");
+#endif
+ _sp = sp;
+ _unextended_sp = sp;
+ _fp = fp;
+ _pc = pc;
+ assert(pc != NULL, "no pc?");
+ _cb = CodeCache::find_blob(pc);
+ adjust_unextended_sp();
+
+ address original_pc = nmethod::get_deopt_original_pc(this);
+ if (original_pc != NULL) {
+ _pc = original_pc;
+ _deopt_state = is_deoptimized;
+ } else {
+ _deopt_state = not_deoptimized;
+ }
+}
+
+inline frame::frame(intptr_t* sp, intptr_t* fp, address pc) {
+ init(sp, fp, pc);
+}
+
+inline frame::frame(intptr_t* sp, intptr_t* unextended_sp, intptr_t* fp, address pc) {
+ intptr_t a = intptr_t(sp);
+ intptr_t b = intptr_t(fp);
+#ifndef PRODUCT
+ if (fp)
+ if (sp > fp || (fp - sp > 0x100000))
+ for(;;)
+ asm("nop");
+#endif
+ _sp = sp;
+ _unextended_sp = unextended_sp;
+ _fp = fp;
+ _pc = pc;
+ assert(pc != NULL, "no pc?");
+ _cb = CodeCache::find_blob(pc);
+ adjust_unextended_sp();
+
+ address original_pc = nmethod::get_deopt_original_pc(this);
+ if (original_pc != NULL) {
+ _pc = original_pc;
+ assert(((nmethod*)_cb)->insts_contains(_pc), "original PC must be in nmethod");
+ _deopt_state = is_deoptimized;
+ } else {
+ _deopt_state = not_deoptimized;
+ }
+}
+
+inline frame::frame(intptr_t* sp, intptr_t* fp) {
+ intptr_t a = intptr_t(sp);
+ intptr_t b = intptr_t(fp);
+#ifndef PRODUCT
+ if (fp)
+ if (sp > fp || (fp - sp > 0x100000))
+ for(;;)
+ asm("nop");
+#endif
+ _sp = sp;
+ _unextended_sp = sp;
+ _fp = fp;
+ _pc = (address)(sp[-1]);
+
+ // Here's a sticky one. This constructor can be called via AsyncGetCallTrace
+ // when last_Java_sp is non-null but the pc fetched is junk. If we are truly
+// unlucky the junk value could point to a zombied method and we'll die on the
+ // find_blob call. This is also why we can have no asserts on the validity
+ // of the pc we find here. AsyncGetCallTrace -> pd_get_top_frame_for_signal_handler
+ // -> pd_last_frame should use a specialized version of pd_last_frame which could
+// call a specialized frame constructor instead of this one.
+ // Then we could use the assert below. However this assert is of somewhat dubious
+ // value.
+ // assert(_pc != NULL, "no pc?");
+
+ _cb = CodeCache::find_blob(_pc);
+ adjust_unextended_sp();
+
+ address original_pc = nmethod::get_deopt_original_pc(this);
+ if (original_pc != NULL) {
+ _pc = original_pc;
+ _deopt_state = is_deoptimized;
+ } else {
+ _deopt_state = not_deoptimized;
+ }
+}
+
+// Accessors
+
+inline bool frame::equal(frame other) const {
+ bool ret = sp() == other.sp()
+ && unextended_sp() == other.unextended_sp()
+ && fp() == other.fp()
+ && pc() == other.pc();
+ assert(!ret || ret && cb() == other.cb() && _deopt_state == other._deopt_state, "inconsistent construction");
+ return ret;
+}
+
+// Return unique id for this frame. The id must have a value where we can distinguish
+// identity and younger/older relationship. NULL represents an invalid (incomparable)
+// frame.
+inline intptr_t* frame::id(void) const { return unextended_sp(); }
+
+// Relational comparisons of frames, based on the frame id
+// Return true if the frame is younger (more recent activation) than the frame represented by id
+inline bool frame::is_younger(intptr_t* id) const { assert(this->id() != NULL && id != NULL, "NULL frame id");
+ return this->id() < id ; }
+
+// Return true if the frame is older (less recent activation) than the frame represented by id
+inline bool frame::is_older(intptr_t* id) const { assert(this->id() != NULL && id != NULL, "NULL frame id");
+ return this->id() > id ; }
+
+
+
+inline intptr_t* frame::link() const { return (intptr_t*) *(intptr_t **)addr_at(link_offset); }
+inline void frame::set_link(intptr_t* addr) { *(intptr_t **)addr_at(link_offset) = addr; }
+
+
+inline intptr_t* frame::unextended_sp() const { return _unextended_sp; }
+
+// Return address:
+
+inline address* frame::sender_pc_addr() const { return (address*) addr_at( return_addr_offset); }
+inline address frame::sender_pc() const { return *sender_pc_addr(); }
+
+// return address of param, zero origin index.
+inline address* frame::native_param_addr(int idx) const { return (address*) addr_at( native_frame_initial_param_offset+idx); }
+
+#ifdef CC_INTERP
+
+inline interpreterState frame::get_interpreterState() const {
+ return ((interpreterState)addr_at( -((int)sizeof(BytecodeInterpreter))/wordSize ));
+}
+
+inline intptr_t* frame::sender_sp() const {
+ // Hmm this seems awfully expensive QQQ, is this really called with interpreted frames?
+ if (is_interpreted_frame()) {
+ assert(false, "should never happen");
+ return get_interpreterState()->sender_sp();
+ } else {
+ return addr_at(sender_sp_offset);
+ }
+}
+
+inline intptr_t** frame::interpreter_frame_locals_addr() const {
+ assert(is_interpreted_frame(), "must be interpreted");
+ return &(get_interpreterState()->_locals);
+}
+
+inline intptr_t* frame::interpreter_frame_bcx_addr() const {
+ assert(is_interpreted_frame(), "must be interpreted");
+ return (intptr_t*) &(get_interpreterState()->_bcp);
+}
+
+
+// Constant pool cache
+
+inline constantPoolCacheOop* frame::interpreter_frame_cache_addr() const {
+ assert(is_interpreted_frame(), "must be interpreted");
+ return &(get_interpreterState()->_constants);
+}
+
+// Method
+
+inline methodOop* frame::interpreter_frame_method_addr() const {
+ assert(is_interpreted_frame(), "must be interpreted");
+ return &(get_interpreterState()->_method);
+}
+
+inline intptr_t* frame::interpreter_frame_mdx_addr() const {
+ assert(is_interpreted_frame(), "must be interpreted");
+ return (intptr_t*) &(get_interpreterState()->_mdx);
+}
+
+// top of expression stack
+inline intptr_t* frame::interpreter_frame_tos_address() const {
+ assert(is_interpreted_frame(), "wrong frame type");
+ return get_interpreterState()->_stack + 1;
+}
+
+#else /* asm interpreter */
+inline intptr_t* frame::sender_sp() const { return addr_at( sender_sp_offset); }
+
+inline intptr_t** frame::interpreter_frame_locals_addr() const {
+ return (intptr_t**)addr_at(interpreter_frame_locals_offset);
+}
+
+inline intptr_t* frame::interpreter_frame_last_sp() const {
+ return *(intptr_t**)addr_at(interpreter_frame_last_sp_offset);
+}
+
+inline intptr_t* frame::interpreter_frame_bcp_addr() const {
+ return (intptr_t*)addr_at(interpreter_frame_bcp_offset);
+}
+
+inline intptr_t* frame::interpreter_frame_mdp_addr() const {
+ return (intptr_t*)addr_at(interpreter_frame_mdp_offset);
+}
+
+
+// Constant pool cache
+
+inline ConstantPoolCache** frame::interpreter_frame_cache_addr() const {
+ return (ConstantPoolCache**)addr_at(interpreter_frame_cache_offset);
+}
+
+// Method
+
+inline Method** frame::interpreter_frame_method_addr() const {
+ return (Method**)addr_at(interpreter_frame_method_offset);
+}
+
+// top of expression stack
+inline intptr_t* frame::interpreter_frame_tos_address() const {
+ intptr_t* last_sp = interpreter_frame_last_sp();
+ if (last_sp == NULL) {
+ return sp();
+ } else {
+ // sp() may have been extended or shrunk by an adapter. At least
+ // check that we don't fall behind the legal region.
+ // For top deoptimized frame last_sp == interpreter_frame_monitor_end.
+ assert(last_sp <= (intptr_t*) interpreter_frame_monitor_end(), "bad tos");
+ return last_sp;
+ }
+}
+
+inline oop* frame::interpreter_frame_temp_oop_addr() const {
+ return (oop *)(fp() + interpreter_frame_oop_temp_offset);
+}
+
+#endif /* CC_INTERP */
+
+inline int frame::pd_oop_map_offset_adjustment() const {
+ return 0;
+}
+
+inline int frame::interpreter_frame_monitor_size() {
+ return BasicObjectLock::size();
+}
+
+
+// expression stack
+// (the max_stack arguments are used by the GC; see class FrameClosure)
+
+inline intptr_t* frame::interpreter_frame_expression_stack() const {
+ intptr_t* monitor_end = (intptr_t*) interpreter_frame_monitor_end();
+ return monitor_end-1;
+}
+
+
+inline jint frame::interpreter_frame_expression_stack_direction() { return -1; }
+
+
+// Entry frames
+
+inline JavaCallWrapper** frame::entry_frame_call_wrapper_addr() const {
+ return (JavaCallWrapper**)addr_at(entry_frame_call_wrapper_offset);
+}
+
+
+// Compiled frames
+
+inline int frame::local_offset_for_compiler(int local_index, int nof_args, int max_nof_locals, int max_nof_monitors) {
+ return (nof_args - local_index + (local_index < nof_args ? 1: -1));
+}
+
+inline int frame::monitor_offset_for_compiler(int local_index, int nof_args, int max_nof_locals, int max_nof_monitors) {
+ return local_offset_for_compiler(local_index, nof_args, max_nof_locals, max_nof_monitors);
+}
+
+inline int frame::min_local_offset_for_compiler(int nof_args, int max_nof_locals, int max_nof_monitors) {
+ return (nof_args - (max_nof_locals + max_nof_monitors*2) - 1);
+}
+
+inline bool frame::volatile_across_calls(Register reg) {
+ return true;
+}
+
+
+
+inline oop frame::saved_oop_result(RegisterMap* map) const {
+ oop* result_adr = (oop *)map->location(r0->as_VMReg());
+ guarantee(result_adr != NULL, "bad register save location");
+
+ return (*result_adr);
+}
+
+inline void frame::set_saved_oop_result(RegisterMap* map, oop obj) {
+ oop* result_adr = (oop *)map->location(r0->as_VMReg());
+ guarantee(result_adr != NULL, "bad register save location");
+
+ *result_adr = obj;
+}
+
+#endif // CPU_AARCH64_VM_FRAME_AARCH64_INLINE_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/globalDefinitions_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_GLOBALDEFINITIONS_AARCH64_HPP
+#define CPU_AARCH64_VM_GLOBALDEFINITIONS_AARCH64_HPP
+
+const int StackAlignmentInBytes = 16;
+
+// Indicates whether the C calling conventions require that
+// 32-bit integer argument values are properly extended to 64 bits.
+// If set, SharedRuntime::c_calling_convention() must adapt
+// signatures accordingly.
+const bool CCallingConventionRequiresIntsAsLongs = true;
+
+#define SUPPORTS_NATIVE_CX8
+
+// The maximum B/BL offset range on AArch64 is 128MB.
+#undef CODE_CACHE_DEFAULT_LIMIT
+#define CODE_CACHE_DEFAULT_LIMIT (128*M)
+
+// According to the ARMv8 ARM, "Concurrent modification and execution
+// of instructions can lead to the resulting instruction performing
+// any behavior that can be achieved by executing any sequence of
+// instructions that can be executed from the same Exception level,
+// except where the instruction before modification and the
+// instruction after modification is a B, BL, NOP, BKPT, SVC, HVC, or
+// SMC instruction."
+//
+// This makes the games we play when patching difficult, so when we
+// come across an access that needs patching we deoptimize. There are
+// ways we can avoid this, but these would slow down C1-compiled code
+// in the default case. We could revisit this decision if we get any
+// evidence that it's worth doing.
+#define DEOPTIMIZE_WHEN_PATCHING
+
+#endif // CPU_AARCH64_VM_GLOBALDEFINITIONS_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/globals_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2000, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_GLOBALS_AARCH64_HPP
+#define CPU_AARCH64_VM_GLOBALS_AARCH64_HPP
+
+#include "utilities/globalDefinitions.hpp"
+#include "utilities/macros.hpp"
+
+// Sets the default values for platform dependent flags used by the runtime system.
+// (see globals.hpp)
+
+define_pd_global(bool, ConvertSleepToYield, true);
+define_pd_global(bool, ShareVtableStubs, true);
+define_pd_global(bool, CountInterpCalls, true);
+define_pd_global(bool, NeedsDeoptSuspend, false); // only register window machines need this
+
+define_pd_global(bool, ImplicitNullChecks, true); // Generate code for implicit null checks
+define_pd_global(bool, TrapBasedNullChecks, false);
+define_pd_global(bool, UncommonNullCast, true);  // Uncommon-trap NULLs passed to check cast
+
+// See 4827828 for this change. There is no globals_core_i486.hpp. I can't
+// assign a different value for C2 without touching a number of files. Use
+// #ifdef to minimize the change as it's late in Mantis. -- FIXME.
+// c1 doesn't have this problem because the fix to 4858033 assures us
+// that the vep is aligned at CodeEntryAlignment whereas c2 only aligns
+// the uep, and the vep doesn't get real alignment but just slops on by,
+// only assured that the entry instruction meets the 5 byte size requirement.
+#ifdef COMPILER2
+define_pd_global(intx, CodeEntryAlignment, 64);
+#else
+define_pd_global(intx, CodeEntryAlignment, 16);
+#endif // COMPILER2
+define_pd_global(intx, OptoLoopAlignment, 16);
+define_pd_global(intx, InlineFrequencyCount, 100);
+
+define_pd_global(intx, StackYellowPages, 2);
+define_pd_global(intx, StackRedPages, 1);
+
+define_pd_global(intx, StackShadowPages, 4 DEBUG_ONLY(+5));
+
+define_pd_global(intx, PreInflateSpin, 10);
+
+define_pd_global(bool, RewriteBytecodes, true);
+define_pd_global(bool, RewriteFrequentPairs, false);
+
+define_pd_global(bool, UseMembar, true);
+
+// GC Ergo Flags
+define_pd_global(uintx, CMSYoungGenPerWorker, 64*M); // default max size of CMS young gen, per GC worker thread
+
+define_pd_global(uintx, TypeProfileLevel, 111);
+
+// avoid biased locking while we are bootstrapping the aarch64 build
+define_pd_global(bool, UseBiasedLocking, false);
+
+#if defined(COMPILER1) || defined(COMPILER2)
+define_pd_global(intx, InlineSmallCode, 1000);
+#endif
+
+#ifdef BUILTIN_SIM
+#define UseBuiltinSim true
+#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct) \
+ \
+ product(bool, NotifySimulator, UseBuiltinSim, \
+ "tell the AArch64 sim where we are in method code") \
+ \
+ product(bool, UseSimulatorCache, false, \
+ "tell sim to cache memory updates until exclusive op occurs") \
+ \
+ product(bool, DisableBCCheck, true, \
+ "tell sim not to invoke bccheck callback") \
+ \
+ product(bool, NearCpool, true, \
+ "constant pool is close to instructions") \
+ \
+ notproduct(bool, UseAcqRelForVolatileFields, false, \
+ "Use acquire and release insns for volatile fields") \
+ \
+ product(bool, UseCRC32, false, \
+ "Use CRC32 instructions for CRC32 computation") \
+
+// Don't attempt to use Neon on builtin sim until builtin sim supports it
+#define UseCRC32 false
+
+#else
+#define UseBuiltinSim false
+#define NotifySimulator false
+#define UseSimulatorCache false
+#define DisableBCCheck true
+#define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct) \
+ \
+ product(bool, NearCpool, true, \
+ "constant pool is close to instructions") \
+ \
+ notproduct(bool, UseAcqRelForVolatileFields, false, \
+ "Use acquire and release insns for volatile fields") \
+ product(bool, UseNeon, false, \
+ "Use Neon for CRC32 computation") \
+ product(bool, UseCRC32, false, \
+ "Use CRC32 instructions for CRC32 computation") \
+ product(bool, TraceTraps, false, "Trace all traps the signal handler handles")
+
+#endif
+
+
+#endif // CPU_AARCH64_VM_GLOBALS_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/icBuffer_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "asm/macroAssembler.inline.hpp"
+#include "code/icBuffer.hpp"
+#include "gc_interface/collectedHeap.inline.hpp"
+#include "interpreter/bytecodes.hpp"
+#include "memory/resourceArea.hpp"
+#include "nativeInst_aarch64.hpp"
+#include "oops/oop.inline.hpp"
+#include "oops/oop.inline2.hpp"
+
+int InlineCacheBuffer::ic_stub_code_size() {
+ return (MacroAssembler::far_branches() ? 6 : 4) * NativeInstruction::instruction_size;
+}
+
+#define __ masm->
+
+void InlineCacheBuffer::assemble_ic_buffer_code(address code_begin, void* cached_value, address entry_point) {
+ ResourceMark rm;
+ CodeBuffer code(code_begin, ic_stub_code_size());
+ MacroAssembler* masm = new MacroAssembler(&code);
+ // note: even though the code contains an embedded value, we do not need reloc info
+ // because
+ // (1) the value is old (i.e., doesn't matter for scavenges)
+ // (2) these ICStubs are removed *before* a GC happens, so the roots disappear
+ // assert(cached_value == NULL || cached_oop->is_perm(), "must be perm oop");
+
+ address start = __ pc();
+ Label l;
+ __ ldr(rscratch2, l);
+ __ far_jump(ExternalAddress(entry_point));
+ __ align(wordSize);
+ __ bind(l);
+ __ emit_int64((int64_t)cached_value);
+ // Only need to invalidate the 1st two instructions - not the whole ic stub
+ ICache::invalidate_range(code_begin, InlineCacheBuffer::ic_stub_code_size());
+ assert(__ pc() - start == ic_stub_code_size(), "must be");
+}
+
+address InlineCacheBuffer::ic_buffer_entry_point(address code_begin) {
+ NativeMovConstReg* move = nativeMovConstReg_at(code_begin); // creation also verifies the object
+ NativeJump* jump = nativeJump_at(code_begin + 4);
+ return jump->jump_destination();
+}
+
+
+void* InlineCacheBuffer::ic_buffer_cached_value(address code_begin) {
+ // The word containing the cached value is at the end of this IC buffer
+ uintptr_t *p = (uintptr_t *)(code_begin + ic_stub_code_size() - wordSize);
+ void* o = (void*)*p;
+ return o;
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/icache_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "runtime/icache.hpp"
+
+extern void aarch64TestHook();
+
+void ICacheStubGenerator::generate_icache_flush(
+ ICache::flush_icache_stub_t* flush_icache_stub) {
+ // Give anyone who calls this a surprise
+ *flush_icache_stub = (ICache::flush_icache_stub_t)NULL;
+}
+
+void ICache::initialize() {
+ aarch64TestHook();
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/icache_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_ICACHE_AARCH64_HPP
+#define CPU_AARCH64_VM_ICACHE_AARCH64_HPP
+
+// Interface for updating the instruction cache. Whenever the VM
+// modifies code, part of the processor instruction cache potentially
+// has to be flushed.
+
+class ICache : public AbstractICache {
+ public:
+ static void initialize();
+ static void invalidate_word(address addr) {
+ __clear_cache((char *)addr, (char *)(addr + 3));
+ }
+ static void invalidate_range(address start, int nbytes) {
+ __clear_cache((char *)start, (char *)(start + nbytes));
+ }
+};
+
+#endif // CPU_AARCH64_VM_ICACHE_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/immediate_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include <stdlib.h>
+#include "decode_aarch64.hpp"
+#include "immediate_aarch64.hpp"
+
+// there are at most 2^13 possible logical immediate encodings
+// however, some combinations of immr and imms are invalid
+static const unsigned LI_TABLE_SIZE = (1 << 13);
+
+static int li_table_entry_count;
+
+// for forward lookup we just use a direct array lookup
+// and assume that the client has supplied a valid encoding
+// table[encoding] = immediate
+static u_int64_t LITable[LI_TABLE_SIZE];
+
+// for reverse lookup we need a sparse map so we store a table of
+// immediate and encoding pairs sorted by immediate value
+
+struct li_pair {
+ u_int64_t immediate;
+ u_int32_t encoding;
+};
+
+static struct li_pair InverseLITable[LI_TABLE_SIZE];
+
+// comparator to sort entries in the inverse table
+int compare_immediate_pair(const void *i1, const void *i2)
+{
+ struct li_pair *li1 = (struct li_pair *)i1;
+ struct li_pair *li2 = (struct li_pair *)i2;
+ if (li1->immediate < li2->immediate) {
+ return -1;
+ }
+ if (li1->immediate > li2->immediate) {
+ return 1;
+ }
+ return 0;
+}
+
+// helper functions used by expandLogicalImmediate
+
+// for i = 1, ..., N: result<i-1> = 1; other bits are zero
+static inline u_int64_t ones(int N)
+{
+ return (N == 64 ? (u_int64_t)-1UL : ((1UL << N) - 1));
+}
+
+// result<0> is set to val<N>; other bits are zero
+static inline u_int64_t pickbit(u_int64_t val, int N)
+{
+ return pickbits64(val, N, N);
+}
+
+
+// SPEC bits(M*N) Replicate(bits(M) x, integer N);
+// this is just an educated guess
+
+u_int64_t replicate(u_int64_t bits, int nbits, int count)
+{
+ u_int64_t result = 0;
+ // nbits may be 64 in which case we want mask to be -1
+ u_int64_t mask = ones(nbits);
+ for (int i = 0; i < count ; i++) {
+ result <<= nbits;
+ result |= (bits & mask);
+ }
+ return result;
+}
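+
+// illustrative examples of the behaviour implemented above:
+//   replicate(0b1, 1, 4)   == 0b1111
+//   replicate(0b101, 3, 2) == 0b101101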
+
+// this function writes the supplied bimm reference and returns a
+// boolean to indicate success (1) or fail (0) because an illegal
+// encoding must be treated as an UNALLOC instruction
+
+// construct the 64 bit immediate value for a logical immediate operation
+int expandLogicalImmediate(u_int32_t immN, u_int32_t immr,
+ u_int32_t imms, u_int64_t &bimm)
+{
+ int len; // ought to be <= 6
+ u_int32_t levels; // 6 bits
+ u_int32_t tmask_and; // 6 bits
+ u_int32_t wmask_and; // 6 bits
+ u_int32_t tmask_or; // 6 bits
+ u_int32_t wmask_or; // 6 bits
+ u_int64_t imm64; // 64 bits
+ u_int64_t tmask, wmask; // 64 bits
+ u_int32_t S, R, diff; // 6 bits?
+
+ if (immN == 1) {
+ len = 6; // looks like 7 given the spec above but this cannot be!
+ } else {
+ len = 0;
+ u_int32_t val = (~imms & 0x3f);
+ for (int i = 5; i > 0; i--) {
+ if (val & (1 << i)) {
+ len = i;
+ break;
+ }
+ }
+ if (len < 1) {
+ return 0;
+ }
+ // for valid inputs leading 1s in immr must be less than leading
+ // zeros in imms
+ int len2 = 0; // ought to be < len
+ u_int32_t val2 = (~immr & 0x3f);
+ for (int i = 5; i > 0; i--) {
+ if (!(val2 & (1 << i))) {
+ len2 = i;
+ break;
+ }
+ }
+ if (len2 >= len) {
+ return 0;
+ }
+ }
+
+ levels = (1 << len) - 1;
+
+ if ((imms & levels) == levels) {
+ return 0;
+ }
+
+ S = imms & levels;
+ R = immr & levels;
+
+ // 6 bit arithmetic!
+ diff = S - R;
+ tmask_and = (diff | ~levels) & 0x3f;
+ tmask_or = (diff & levels) & 0x3f;
+ tmask = 0xffffffffffffffffULL;
+
+ for (int i = 0; i < 6; i++) {
+ int nbits = 1 << i;
+ u_int64_t and_bit = pickbit(tmask_and, i);
+ u_int64_t or_bit = pickbit(tmask_or, i);
+ u_int64_t and_bits_sub = replicate(and_bit, 1, nbits);
+ u_int64_t or_bits_sub = replicate(or_bit, 1, nbits);
+ u_int64_t and_bits_top = (and_bits_sub << nbits) | ones(nbits);
+ u_int64_t or_bits_top = (0 << nbits) | or_bits_sub;
+
+ tmask = ((tmask
+ & (replicate(and_bits_top, 2 * nbits, 32 / nbits)))
+ | replicate(or_bits_top, 2 * nbits, 32 / nbits));
+ }
+
+ wmask_and = (immr | ~levels) & 0x3f;
+ wmask_or = (immr & levels) & 0x3f;
+
+ wmask = 0;
+
+ for (int i = 0; i < 6; i++) {
+ int nbits = 1 << i;
+ u_int64_t and_bit = pickbit(wmask_and, i);
+ u_int64_t or_bit = pickbit(wmask_or, i);
+ u_int64_t and_bits_sub = replicate(and_bit, 1, nbits);
+ u_int64_t or_bits_sub = replicate(or_bit, 1, nbits);
+ u_int64_t and_bits_top = (ones(nbits) << nbits) | and_bits_sub;
+ u_int64_t or_bits_top = (or_bits_sub << nbits) | 0;
+
+ wmask = ((wmask
+ & (replicate(and_bits_top, 2 * nbits, 32 / nbits)))
+ | replicate(or_bits_top, 2 * nbits, 32 / nbits));
+ }
+
+ if (diff & (1U << 6)) {
+ imm64 = tmask & wmask;
+ } else {
+ imm64 = tmask | wmask;
+ }
+
+
+ bimm = imm64;
+ return 1;
+}
+
+// constructor to initialise the lookup tables
+
+static void initLITables() __attribute__ ((constructor));
+static void initLITables()
+{
+ li_table_entry_count = 0;
+ for (unsigned index = 0; index < LI_TABLE_SIZE; index++) {
+ u_int32_t N = uimm(index, 12, 12);
+ u_int32_t immr = uimm(index, 11, 6);
+ u_int32_t imms = uimm(index, 5, 0);
+ if (expandLogicalImmediate(N, immr, imms, LITable[index])) {
+ InverseLITable[li_table_entry_count].immediate = LITable[index];
+ InverseLITable[li_table_entry_count].encoding = index;
+ li_table_entry_count++;
+ }
+ }
+ // now sort the inverse table
+ qsort(InverseLITable, li_table_entry_count,
+ sizeof(InverseLITable[0]), compare_immediate_pair);
+}
+
+// public APIs provided for logical immediate lookup and reverse lookup
+
+u_int64_t logical_immediate_for_encoding(u_int32_t encoding)
+{
+ return LITable[encoding];
+}
+
+u_int32_t encoding_for_logical_immediate(u_int64_t immediate)
+{
+ struct li_pair pair;
+ struct li_pair *result;
+
+ pair.immediate = immediate;
+
+ result = (struct li_pair *)
+ bsearch(&pair, InverseLITable, li_table_entry_count,
+ sizeof(InverseLITable[0]), compare_immediate_pair);
+
+ if (result) {
+ return result->encoding;
+ }
+
+ return 0xffffffff;
+}
+
+// floating point immediates are encoded in 8 bits
+// fpimm[7] = sign bit
+// fpimm[6:4] = signed exponent
+// fpimm[3:0] = fraction (assuming leading 1)
+// i.e. F = s * 1.f * 2^(e - b)
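+//
+// worked example (illustrative, matching the decode below): imm8 == 0x70 gives
+// s = 0, e = 7, f = 0, so the fraction is (16 + 0)/16 = 1.0 and, since e >= 4,
+// it is divided by 2^(7 - e) = 1, i.e. 0x70 encodes 1.0; imm8 == 0x00 gives
+// s = 0, e = 0, f = 0 and is scaled up by 2^(e + 1) = 2, i.e. it encodes 2.0.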
+
+u_int64_t fp_immediate_for_encoding(u_int32_t imm8, int is_dp)
+{
+ union {
+ float fpval;
+ double dpval;
+ u_int64_t val;
+ };
+
+ u_int32_t s, e, f;
+ s = (imm8 >> 7 ) & 0x1;
+ e = (imm8 >> 4) & 0x7;
+ f = imm8 & 0xf;
+ // the fp value is s * n/16 * 2^exp, where n is 16+f and exp is decoded from e below
+ fpval = (16.0 + f) / 16.0;
+ // n.b. exponent is signed
+ if (e < 4) {
+ int epos = e;
+ for (int i = 0; i <= epos; i++) {
+ fpval *= 2.0;
+ }
+ } else {
+ int eneg = 7 - e;
+ for (int i = 0; i < eneg; i++) {
+ fpval /= 2.0;
+ }
+ }
+
+ if (s) {
+ fpval = -fpval;
+ }
+ if (is_dp) {
+ dpval = (double)fpval;
+ }
+ return val;
+}
+
+u_int32_t encoding_for_fp_immediate(float immediate)
+{
+ // given a float which is of the form
+ //
+ //   s * n/16 * 2^r
+ //
+ // where n is 16+f and imm1:s, imm4:f, simm3:r
+ // return the imm8 result [s:r:f]
+ //
+
+ union {
+ float fpval;
+ u_int32_t val;
+ };
+ fpval = immediate;
+ u_int32_t s, r, f, res;
+ // sign bit is 31
+ s = (val >> 31) & 0x1;
+ // exponent is bits 30-23 but we only want the bottom 3 bits
+ // strictly we ought to check that bits 30-25 are
+ // either all 1s or all 0s
+ r = (val >> 23) & 0x7;
+ // fraction is bits 22-0
+ f = (val >> 19) & 0xf;
+ res = (s << 7) | (r << 4) | f;
+ return res;
+}
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/immediate_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef _IMMEDIATE_H
+#define _IMMEDIATE_H
+
+#include <sys/types.h>
+
+/*
+ * functions to map backwards and forwards between logical or floating
+ * point immediates and their corresponding encodings. the mapping
+ * from encoding to immediate is required by the simulator. the reverse
+ * mapping is required by the OpenJDK assembler.
+ *
+ * a logical immediate value supplied to or returned from a map lookup
+ * is always 64 bits. this is sufficient for looking up 32 bit
+ * immediates or their encodings since a 32 bit immediate has the same
+ * encoding as the 64 bit immediate produced by concatenating the
+ * immediate with itself.
+ *
+ * a logical immediate encoding is 13 bits N:immr:imms (3 fields of
+ * widths 1:6:6 -- see the arm spec). they appear as bits [22:10] of a
+ * logical immediate instruction. encodings are supplied and returned
+ * as 32 bit values. if a given 13 bit immediate has no corresponding
+ * encoding then a map lookup will return 0xffffffff.
+ */
+
+u_int64_t logical_immediate_for_encoding(u_int32_t encoding);
+u_int32_t encoding_for_logical_immediate(u_int64_t immediate);
+u_int64_t fp_immediate_for_encoding(u_int32_t imm8, int is_dp);
+u_int32_t encoding_for_fp_immediate(float immediate);
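+
+/*
+ * illustrative round trip (a sketch, not part of the interface contract):
+ *
+ *   u_int64_t v   = 0xffULL;                             // eight consecutive ones
+ *   u_int32_t enc = encoding_for_logical_immediate(v);   // encodable, so != 0xffffffff
+ *   assert(logical_immediate_for_encoding(enc) == v);
+ *
+ * a value with no legal N:immr:imms encoding simply yields 0xffffffff.
+ */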
+
+#endif // _IMMEDIATE_H
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/interp_masm_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,1682 @@
+/*
+ * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "interp_masm_aarch64.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "oops/arrayOop.hpp"
+#include "oops/markOop.hpp"
+#include "oops/methodData.hpp"
+#include "oops/method.hpp"
+#include "prims/jvmtiExport.hpp"
+#include "prims/jvmtiRedefineClassesTrace.hpp"
+#include "prims/jvmtiThreadState.hpp"
+#include "runtime/basicLock.hpp"
+#include "runtime/biasedLocking.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/thread.inline.hpp"
+
+
+// Implementation of InterpreterMacroAssembler
+
+#ifndef CC_INTERP
+
+void InterpreterMacroAssembler::check_and_handle_popframe(Register java_thread) {
+ if (JvmtiExport::can_pop_frame()) {
+ Label L;
+ // Initiate popframe handling only if it is not already being
+ // processed. If the flag has the popframe_processing bit set, it
+ // means that this code is called *during* popframe handling - we
+ // don't want to reenter.
+ // This method is only called just after the call into the vm in
+ // call_VM_base, so the arg registers are available.
+ ldrw(rscratch1, Address(rthread, JavaThread::popframe_condition_offset()));
+ tstw(rscratch1, JavaThread::popframe_pending_bit);
+ br(Assembler::EQ, L);
+ tstw(rscratch1, JavaThread::popframe_processing_bit);
+ br(Assembler::NE, L);
+ // Call Interpreter::remove_activation_preserving_args_entry() to get the
+ // address of the same-named entrypoint in the generated interpreter code.
+ call_VM_leaf(CAST_FROM_FN_PTR(address, Interpreter::remove_activation_preserving_args_entry));
+ br(r0);
+ bind(L);
+ }
+}
+
+
+void InterpreterMacroAssembler::load_earlyret_value(TosState state) {
+ ldr(r2, Address(rthread, JavaThread::jvmti_thread_state_offset()));
+ const Address tos_addr(r2, JvmtiThreadState::earlyret_tos_offset());
+ const Address oop_addr(r2, JvmtiThreadState::earlyret_oop_offset());
+ const Address val_addr(r2, JvmtiThreadState::earlyret_value_offset());
+ switch (state) {
+ case atos: ldr(r0, oop_addr);
+ str(zr, oop_addr);
+ verify_oop(r0, state); break;
+ case ltos: ldr(r0, val_addr); break;
+ case btos: // fall through
+ case ctos: // fall through
+ case stos: // fall through
+ case itos: ldrw(r0, val_addr); break;
+ case ftos: ldrs(v0, val_addr); break;
+ case dtos: ldrd(v0, val_addr); break;
+ case vtos: /* nothing to do */ break;
+ default : ShouldNotReachHere();
+ }
+ // Clean up tos value in the thread object
+ movw(rscratch1, (int) ilgl);
+ strw(rscratch1, tos_addr);
+ strw(zr, val_addr);
+}
+
+
+void InterpreterMacroAssembler::check_and_handle_earlyret(Register java_thread) {
+ if (JvmtiExport::can_force_early_return()) {
+ Label L;
+ ldr(rscratch1, Address(rthread, JavaThread::jvmti_thread_state_offset()));
+ cbz(rscratch1, L); // if (thread->jvmti_thread_state() == NULL) exit;
+
+ // Initiate earlyret handling only if it is not already being processed.
+ // If the flag has the earlyret_processing bit set, it means that this code
+ // is called *during* earlyret handling - we don't want to reenter.
+ ldrw(rscratch1, Address(rscratch1, JvmtiThreadState::earlyret_state_offset()));
+ cmpw(rscratch1, JvmtiThreadState::earlyret_pending);
+ br(Assembler::NE, L);
+
+ // Call Interpreter::remove_activation_early_entry() to get the address of the
+ // same-named entrypoint in the generated interpreter code.
+ ldr(rscratch1, Address(rthread, JavaThread::jvmti_thread_state_offset()));
+ ldrw(rscratch1, Address(rscratch1, JvmtiThreadState::earlyret_tos_offset()));
+ call_VM_leaf(CAST_FROM_FN_PTR(address, Interpreter::remove_activation_early_entry), rscratch1);
+ br(r0);
+ bind(L);
+ }
+}
+
+void InterpreterMacroAssembler::get_unsigned_2_byte_index_at_bcp(
+ Register reg,
+ int bcp_offset) {
+ assert(bcp_offset >= 0, "bcp is still pointing to start of bytecode");
+ ldrh(reg, Address(rbcp, bcp_offset));
+ rev16(reg, reg);
+}
+
+void InterpreterMacroAssembler::get_dispatch() {
+ unsigned long offset;
+ adrp(rdispatch, ExternalAddress((address)Interpreter::dispatch_table()), offset);
+ lea(rdispatch, Address(rdispatch, offset));
+}
+
+void InterpreterMacroAssembler::get_cache_index_at_bcp(Register index,
+ int bcp_offset,
+ size_t index_size) {
+ assert(bcp_offset > 0, "bcp is still pointing to start of bytecode");
+ if (index_size == sizeof(u2)) {
+ load_unsigned_short(index, Address(rbcp, bcp_offset));
+ } else if (index_size == sizeof(u4)) {
+ // assert(EnableInvokeDynamic, "giant index used only for JSR 292");
+ ldrw(index, Address(rbcp, bcp_offset));
+ // Check if the secondary index definition is still ~x, otherwise
+ // we have to change the following assembler code to calculate the
+ // plain index.
+ assert(ConstantPool::decode_invokedynamic_index(~123) == 123, "else change next line");
+ eonw(index, index, zr); // convert to plain index
+ } else if (index_size == sizeof(u1)) {
+ load_unsigned_byte(index, Address(rbcp, bcp_offset));
+ } else {
+ ShouldNotReachHere();
+ }
+}
+
+// Return
+// Rindex: index into constant pool
+// Rcache: address of cache entry - ConstantPoolCache::base_offset()
+//
+// A caller must add ConstantPoolCache::base_offset() to Rcache to get
+// the true address of the cache entry.
+//
+void InterpreterMacroAssembler::get_cache_and_index_at_bcp(Register cache,
+ Register index,
+ int bcp_offset,
+ size_t index_size) {
+ assert_different_registers(cache, index);
+ assert_different_registers(cache, rcpool);
+ get_cache_index_at_bcp(index, bcp_offset, index_size);
+ assert(sizeof(ConstantPoolCacheEntry) == 4 * wordSize, "adjust code below");
+ // convert from field index to ConstantPoolCacheEntry
+ // aarch64 already has the cache in rcpool so there is no need to
+ // install it in cache. instead we pre-add the indexed offset to
+ // rcpool and return it in cache. All clients of this method need to
+ // be modified accordingly.
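+ // (LSL 5 scales the index by 32 bytes, the 4 * wordSize entry size
+ // asserted above)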
+ add(cache, rcpool, index, Assembler::LSL, 5);
+}
+
+
+void InterpreterMacroAssembler::get_cache_and_index_and_bytecode_at_bcp(Register cache,
+ Register index,
+ Register bytecode,
+ int byte_no,
+ int bcp_offset,
+ size_t index_size) {
+ get_cache_and_index_at_bcp(cache, index, bcp_offset, index_size);
+ // We use a 32-bit load here since the layout of 64-bit words on
+ // little-endian machines allows us to do that.
+ // n.b. unlike x86, cache already includes the index offset
+ ldrw(bytecode, Address(cache,
+ ConstantPoolCache::base_offset()
+ + ConstantPoolCacheEntry::indices_offset()));
+ const int shift_count = (1 + byte_no) * BitsPerByte;
+ ubfx(bytecode, bytecode, shift_count, BitsPerByte);
+}
+
+void InterpreterMacroAssembler::get_cache_entry_pointer_at_bcp(Register cache,
+ Register tmp,
+ int bcp_offset,
+ size_t index_size) {
+ assert(cache != tmp, "must use different register");
+ get_cache_index_at_bcp(tmp, bcp_offset, index_size);
+ assert(sizeof(ConstantPoolCacheEntry) == 4 * wordSize, "adjust code below");
+ // convert from field index to ConstantPoolCacheEntry index
+ // and from word offset to byte offset
+ assert(exact_log2(in_bytes(ConstantPoolCacheEntry::size_in_bytes())) == 2 + LogBytesPerWord, "else change next line");
+ ldr(cache, Address(rfp, frame::interpreter_frame_cache_offset * wordSize));
+ // skip past the header
+ add(cache, cache, in_bytes(ConstantPoolCache::base_offset()));
+ add(cache, cache, tmp, Assembler::LSL, 2 + LogBytesPerWord); // construct pointer to cache entry
+}
+
+void InterpreterMacroAssembler::get_method_counters(Register method,
+ Register mcs, Label& skip) {
+ Label has_counters;
+ ldr(mcs, Address(method, Method::method_counters_offset()));
+ cbnz(mcs, has_counters);
+ call_VM(noreg, CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::build_method_counters), method);
+ ldr(mcs, Address(method, Method::method_counters_offset()));
+ cbz(mcs, skip); // No MethodCounters allocated, OutOfMemory
+ bind(has_counters);
+}
+
+// Load object from cpool->resolved_references(index)
+void InterpreterMacroAssembler::load_resolved_reference_at_index(
+ Register result, Register index) {
+ assert_different_registers(result, index);
+ // convert from field index to resolved_references() index and from
+ // word index to byte offset. Since this is a java object, it can be compressed
+ Register tmp = index; // reuse
+ lslw(tmp, tmp, LogBytesPerHeapOop);
+
+ get_constant_pool(result);
+ // load pointer for resolved_references[] objArray
+ ldr(result, Address(result, ConstantPool::resolved_references_offset_in_bytes()));
+ // JNIHandles::resolve(obj);
+ ldr(result, Address(result, 0));
+ // Add in the index
+ add(result, result, tmp);
+ load_heap_oop(result, Address(result, arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
+}
+
+// Generate a subtype check: branch to ok_is_subtype if sub_klass is a
+// subtype of super_klass.
+//
+// Args:
+// r0: superklass
+// Rsub_klass: subklass
+//
+// Kills:
+// r2, r5
+void InterpreterMacroAssembler::gen_subtype_check(Register Rsub_klass,
+ Label& ok_is_subtype) {
+ assert(Rsub_klass != r0, "r0 holds superklass");
+ assert(Rsub_klass != r2, "r2 holds 2ndary super array length");
+ assert(Rsub_klass != r5, "r5 holds 2ndary super array scan ptr");
+
+ // Profile the not-null value's klass.
+ profile_typecheck(r2, Rsub_klass, r5); // blows r2, reloads r5
+
+ // Do the check.
+ check_klass_subtype(Rsub_klass, r0, r2, ok_is_subtype); // blows r2
+
+ // Profile the failure of the check.
+ profile_typecheck_failed(r2); // blows r2
+}
+
+// Java Expression Stack
+
+void InterpreterMacroAssembler::pop_ptr(Register r) {
+ ldr(r, post(esp, wordSize));
+}
+
+void InterpreterMacroAssembler::pop_i(Register r) {
+ ldrw(r, post(esp, wordSize));
+}
+
+void InterpreterMacroAssembler::pop_l(Register r) {
+ ldr(r, post(esp, 2 * Interpreter::stackElementSize));
+}
+
+void InterpreterMacroAssembler::push_ptr(Register r) {
+ str(r, pre(esp, -wordSize));
+ }
+
+void InterpreterMacroAssembler::push_i(Register r) {
+ str(r, pre(esp, -wordSize));
+}
+
+void InterpreterMacroAssembler::push_l(Register r) {
+ str(r, pre(esp, 2 * -wordSize));
+}
+
+void InterpreterMacroAssembler::pop_f(FloatRegister r) {
+ ldrs(r, post(esp, wordSize));
+}
+
+void InterpreterMacroAssembler::pop_d(FloatRegister r) {
+ ldrd(r, post(esp, 2 * Interpreter::stackElementSize));
+}
+
+void InterpreterMacroAssembler::push_f(FloatRegister r) {
+ strs(r, pre(esp, -wordSize));
+}
+
+void InterpreterMacroAssembler::push_d(FloatRegister r) {
+ strd(r, pre(esp, 2* -wordSize));
+}
+
+void InterpreterMacroAssembler::pop(TosState state) {
+ switch (state) {
+ case atos: pop_ptr(); break;
+ case btos:
+ case ctos:
+ case stos:
+ case itos: pop_i(); break;
+ case ltos: pop_l(); break;
+ case ftos: pop_f(); break;
+ case dtos: pop_d(); break;
+ case vtos: /* nothing to do */ break;
+ default: ShouldNotReachHere();
+ }
+ verify_oop(r0, state);
+}
+
+void InterpreterMacroAssembler::push(TosState state) {
+ verify_oop(r0, state);
+ switch (state) {
+ case atos: push_ptr(); break;
+ case btos:
+ case ctos:
+ case stos:
+ case itos: push_i(); break;
+ case ltos: push_l(); break;
+ case ftos: push_f(); break;
+ case dtos: push_d(); break;
+ case vtos: /* nothing to do */ break;
+ default : ShouldNotReachHere();
+ }
+}
+
+// Helpers for swap and dup
+void InterpreterMacroAssembler::load_ptr(int n, Register val) {
+ ldr(val, Address(esp, Interpreter::expr_offset_in_bytes(n)));
+}
+
+void InterpreterMacroAssembler::store_ptr(int n, Register val) {
+ str(val, Address(esp, Interpreter::expr_offset_in_bytes(n)));
+}
+
+
+void InterpreterMacroAssembler::prepare_to_jump_from_interpreted() {
+ // set sender sp
+ mov(r13, sp);
+ // record last_sp
+ str(esp, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
+}
+
+// Jump to the from_interpreted entry of a call unless single stepping is
+// possible in this thread, in which case we must call the i2i entry.
+void InterpreterMacroAssembler::jump_from_interpreted(Register method, Register temp) {
+ prepare_to_jump_from_interpreted();
+
+ if (JvmtiExport::can_post_interpreter_events()) {
+ Label run_compiled_code;
+ // JVMTI events, such as single-stepping, are implemented partly by avoiding running
+ // compiled code in threads for which the event is enabled. Check here for
+ // interp_only_mode if these events CAN be enabled.
+ // interp_only is an int; on little endian a test of the low byte alone
+ // would suffice. Is a cmpl faster?
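+ // In effect (a sketch of the check below): if thread->interp_only_mode
+ // is non-zero, jump to the method's interpreter (i2i) entry; otherwise
+ // fall through to the from_interpreted entry.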
+ ldr(rscratch1, Address(rthread, JavaThread::interp_only_mode_offset()));
+ cbz(rscratch1, run_compiled_code);
+ ldr(rscratch1, Address(method, Method::interpreter_entry_offset()));
+ br(rscratch1);
+ bind(run_compiled_code);
+ }
+
+ ldr(rscratch1, Address(method, Method::from_interpreted_offset()));
+ br(rscratch1);
+}
+
+// The following two routines provide a hook so that an implementation
+// can schedule the dispatch in two parts. amd64 does not do this.
+void InterpreterMacroAssembler::dispatch_prolog(TosState state, int step) {
+}
+
+void InterpreterMacroAssembler::dispatch_epilog(TosState state, int step) {
+ dispatch_next(state, step);
+}
+
+void InterpreterMacroAssembler::dispatch_base(TosState state,
+ address* table,
+ bool verifyoop) {
+ if (VerifyActivationFrameSize) {
+ Unimplemented();
+ }
+ if (verifyoop) {
+ verify_oop(r0, state);
+ }
+ if (table == Interpreter::dispatch_table(state)) {
+ addw(rscratch2, rscratch1, Interpreter::distance_from_dispatch_table(state));
+ ldr(rscratch2, Address(rdispatch, rscratch2, Address::uxtw(3)));
+ } else {
+ mov(rscratch2, (address)table);
+ ldr(rscratch2, Address(rscratch2, rscratch1, Address::uxtw(3)));
+ }
+ br(rscratch2);
+}
+
+void InterpreterMacroAssembler::dispatch_only(TosState state) {
+ dispatch_base(state, Interpreter::dispatch_table(state));
+}
+
+void InterpreterMacroAssembler::dispatch_only_normal(TosState state) {
+ dispatch_base(state, Interpreter::normal_table(state));
+}
+
+void InterpreterMacroAssembler::dispatch_only_noverify(TosState state) {
+ dispatch_base(state, Interpreter::normal_table(state), false);
+}
+
+
+void InterpreterMacroAssembler::dispatch_next(TosState state, int step) {
+ // load next bytecode
+ ldrb(rscratch1, Address(pre(rbcp, step)));
+ dispatch_base(state, Interpreter::dispatch_table(state));
+}
+
+void InterpreterMacroAssembler::dispatch_via(TosState state, address* table) {
+ // load current bytecode
+ ldrb(rscratch1, Address(rbcp, 0));
+ dispatch_base(state, table);
+}
+
+// remove activation
+//
+// Unlock the receiver if this is a synchronized method.
+// Unlock any Java monitors from synchronized blocks.
+// Remove the activation from the stack.
+//
+// If there are locked Java monitors
+// If throw_monitor_exception
+// throws IllegalMonitorStateException
+// Else if install_monitor_exception
+// installs IllegalMonitorStateException
+// Else
+// no error processing
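+//
+// Sketch of the policy implemented below (pseudocode only, not generated
+// code):
+//   if (!do_not_unlock_if_synchronized) {
+//     if (synchronized method)
+//       receiver monitor still locked ? unlock it
+//                                     : throw/install IMSE per the flags;
+//     for (each remaining monitor still locked)
+//       throw_monitor_exception ? throw IMSE
+//                               : unlock it (optionally installing IMSE);
+//   }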
+void InterpreterMacroAssembler::remove_activation(
+ TosState state,
+ bool throw_monitor_exception,
+ bool install_monitor_exception,
+ bool notify_jvmdi) {
+ // Note: registers r3 and v0 (xmm0 on x86) may be in use for the
+ // result check if this is a synchronized method.
+ Label unlocked, unlock, no_unlock;
+
+ // get the value of _do_not_unlock_if_synchronized into r3
+ const Address do_not_unlock_if_synchronized(rthread,
+ in_bytes(JavaThread::do_not_unlock_if_synchronized_offset()));
+ ldrb(r3, do_not_unlock_if_synchronized);
+ strb(zr, do_not_unlock_if_synchronized); // reset the flag
+
+ // get method access flags
+ ldr(r1, Address(rfp, frame::interpreter_frame_method_offset * wordSize));
+ ldr(r2, Address(r1, Method::access_flags_offset()));
+ tst(r2, JVM_ACC_SYNCHRONIZED);
+ br(Assembler::EQ, unlocked);
+
+ // Don't unlock anything if the _do_not_unlock_if_synchronized flag
+ // is set.
+ cbnz(r3, no_unlock);
+
+ // unlock monitor
+ push(state); // save result
+
+ // BasicObjectLock will be first in list, since this is a
+ // synchronized method. However, need to check that the object has
+ // not been unlocked by an explicit monitorexit bytecode.
+ const Address monitor(rfp, frame::interpreter_frame_initial_sp_offset *
+ wordSize - (int) sizeof(BasicObjectLock));
+ // We use c_rarg1 so that if we go slow path it will be the correct
+ // register for unlock_object to pass to VM directly
+ lea(c_rarg1, monitor); // address of first monitor
+
+ ldr(r0, Address(c_rarg1, BasicObjectLock::obj_offset_in_bytes()));
+ cbnz(r0, unlock);
+
+ pop(state);
+ if (throw_monitor_exception) {
+ // Entry already unlocked, need to throw exception
+ call_VM(noreg, CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::throw_illegal_monitor_state_exception));
+ should_not_reach_here();
+ } else {
+ // Monitor already unlocked during a stack unroll. If requested,
+ // install an illegal_monitor_state_exception. Continue with
+ // stack unrolling.
+ if (install_monitor_exception) {
+ call_VM(noreg, CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::new_illegal_monitor_state_exception));
+ }
+ b(unlocked);
+ }
+
+ bind(unlock);
+ unlock_object(c_rarg1);
+ pop(state);
+
+ // Check for block-structured locking (i.e., that all locked
+ // objects have been unlocked)
+ bind(unlocked);
+
+ // r0: Might contain return value
+
+ // Check that all monitors are unlocked
+ {
+ Label loop, exception, entry, restart;
+ const int entry_size = frame::interpreter_frame_monitor_size() * wordSize;
+ const Address monitor_block_top(
+ rfp, frame::interpreter_frame_monitor_block_top_offset * wordSize);
+ const Address monitor_block_bot(
+ rfp, frame::interpreter_frame_initial_sp_offset * wordSize);
+
+ bind(restart);
+ // We use c_rarg1 so that if we go slow path it will be the correct
+ // register for unlock_object to pass to VM directly
+ ldr(c_rarg1, monitor_block_top); // points to current entry, starting
+ // with top-most entry
+ lea(r19, monitor_block_bot); // points to word before bottom of
+ // monitor block
+ b(entry);
+
+ // Entry already locked, need to throw exception
+ bind(exception);
+
+ if (throw_monitor_exception) {
+ // Throw exception
+ MacroAssembler::call_VM(noreg,
+ CAST_FROM_FN_PTR(address, InterpreterRuntime::
+ throw_illegal_monitor_state_exception));
+ should_not_reach_here();
+ } else {
+ // Stack unrolling. Unlock object and install illegal_monitor_exception.
+ // Unlock does not block, so don't have to worry about the frame.
+ // We don't have to preserve c_rarg1 since we are going to throw an exception.
+
+ push(state);
+ unlock_object(c_rarg1);
+ pop(state);
+
+ if (install_monitor_exception) {
+ call_VM(noreg, CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::
+ new_illegal_monitor_state_exception));
+ }
+
+ b(restart);
+ }
+
+ bind(loop);
+ // check if current entry is used
+ ldr(rscratch1, Address(c_rarg1, BasicObjectLock::obj_offset_in_bytes()));
+ cbnz(rscratch1, exception);
+
+ add(c_rarg1, c_rarg1, entry_size); // otherwise advance to next entry
+ bind(entry);
+ cmp(c_rarg1, r19); // check if bottom reached
+ br(Assembler::NE, loop); // if not at bottom then check this entry
+ }
+
+ bind(no_unlock);
+
+ // jvmti support
+ if (notify_jvmdi) {
+ notify_method_exit(state, NotifyJVMTI); // preserve TOSCA
+ } else {
+ notify_method_exit(state, SkipNotifyJVMTI); // preserve TOSCA
+ }
+
+ // remove activation
+ // get sender esp
+ ldr(esp,
+ Address(rfp, frame::interpreter_frame_sender_sp_offset * wordSize));
+ // remove frame anchor
+ leave();
+ // If we're returning to interpreted code we will shortly be
+ // adjusting SP to allow some space for ESP. If we're returning to
+ // compiled code the saved sender SP was saved in sender_sp, so this
+ // restores it.
+ andr(sp, esp, -16);
+}
+
+#endif // CC_INTERP
+
+// Lock object
+//
+// Args:
+// c_rarg1: BasicObjectLock to be used for locking
+//
+// Kills:
+// r0
+// c_rarg0, c_rarg1, c_rarg2, c_rarg3, .. (param regs)
+// rscratch1, rscratch2 (scratch regs)
+void InterpreterMacroAssembler::lock_object(Register lock_reg)
+{
+ assert(lock_reg == c_rarg1, "The argument is only for looks. It must be c_rarg1");
+ if (UseHeavyMonitors) {
+ call_VM(noreg,
+ CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter),
+ lock_reg);
+ } else {
+ Label done;
+
+ const Register swap_reg = r0;
+ const Register obj_reg = c_rarg3; // Will contain the oop
+
+ const int obj_offset = BasicObjectLock::obj_offset_in_bytes();
+ const int lock_offset = BasicObjectLock::lock_offset_in_bytes ();
+ const int mark_offset = lock_offset +
+ BasicLock::displaced_header_offset_in_bytes();
+
+ Label slow_case;
+
+ // Load object pointer into obj_reg %c_rarg3
+ ldr(obj_reg, Address(lock_reg, obj_offset));
+
+ if (UseBiasedLocking) {
+ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, done, &slow_case);
+ }
+
+ // Load (object->mark() | 1) into swap_reg
+ ldr(rscratch1, Address(obj_reg, 0));
+ orr(swap_reg, rscratch1, 1);
+
+ // Save (object->mark() | 1) into BasicLock's displaced header
+ str(swap_reg, Address(lock_reg, mark_offset));
+
+ assert(lock_offset == 0,
+ "displached header must be first word in BasicObjectLock");
+
+ Label fail;
+ if (PrintBiasedLockingStatistics) {
+ Label fast;
+ cmpxchgptr(swap_reg, lock_reg, obj_reg, rscratch1, fast, &fail);
+ bind(fast);
+ atomic_incw(Address((address)BiasedLocking::fast_path_entry_count_addr()),
+ rscratch2, rscratch1);
+ b(done);
+ bind(fail);
+ } else {
+ cmpxchgptr(swap_reg, lock_reg, obj_reg, rscratch1, done, /*fallthrough*/NULL);
+ }
+
+ // Test if the oopMark is an obvious stack pointer, i.e.,
+ // 1) (mark & 7) == 0, and
+ // 2) rsp <= mark < mark + os::pagesize()
+ //
+ // These 3 tests can be done by evaluating the following
+ // expression: ((mark - rsp) & (7 - os::vm_page_size())),
+ // assuming both stack pointer and pagesize have their
+ // least significant 3 bits clear.
+ // NOTE: the oopMark is in swap_reg %r0 as the result of cmpxchg
+ // NOTE2: aarch64 does not like to subtract sp from rn so take a
+ // copy
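+ // Worked example (assuming a 4K page, i.e. os::vm_page_size() == 4096):
+ // 7 - 4096 == -4089 == 0xffff...f007, a mask keeping bits [2:0] and
+ // [63:12]. The AND below is therefore zero exactly when (mark - sp) is
+ // 8-byte aligned and less than one page above sp, i.e. the displaced
+ // mark is a nearby stack address: the recursive-lock case.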
+ mov(rscratch1, sp);
+ sub(swap_reg, swap_reg, rscratch1);
+ ands(swap_reg, swap_reg, (unsigned long)(7 - os::vm_page_size()));
+
+ // Save the test result, for recursive case, the result is zero
+ str(swap_reg, Address(lock_reg, mark_offset));
+
+ if (PrintBiasedLockingStatistics) {
+ br(Assembler::NE, slow_case);
+ atomic_incw(Address((address)BiasedLocking::fast_path_entry_count_addr()),
+ rscratch2, rscratch1);
+ }
+ br(Assembler::EQ, done);
+
+ bind(slow_case);
+
+ // Call the runtime routine for slow case
+ call_VM(noreg,
+ CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter),
+ lock_reg);
+
+ bind(done);
+ }
+}
+
+
+// Unlocks an object. Used in monitorexit bytecode and
+// remove_activation. Throws an IllegalMonitorException if object is
+// not locked by current thread.
+//
+// Args:
+// c_rarg1: BasicObjectLock for lock
+//
+// Kills:
+// r0
+// c_rarg0, c_rarg1, c_rarg2, c_rarg3, ... (param regs)
+// rscratch1, rscratch2 (scratch regs)
+void InterpreterMacroAssembler::unlock_object(Register lock_reg)
+{
+ assert(lock_reg == c_rarg1, "The argument is only for looks. It must be rarg1");
+
+ if (UseHeavyMonitors) {
+ call_VM(noreg,
+ CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorexit),
+ lock_reg);
+ } else {
+ Label done;
+
+ const Register swap_reg = r0;
+ const Register header_reg = c_rarg2; // Will contain the old oopMark
+ const Register obj_reg = c_rarg3; // Will contain the oop
+
+ save_bcp(); // Save in case of exception
+
+ // Convert from BasicObjectLock structure to object and BasicLock
+ // structure. Store the BasicLock address into %r0.
+ lea(swap_reg, Address(lock_reg, BasicObjectLock::lock_offset_in_bytes()));
+
+ // Load oop into obj_reg(%c_rarg3)
+ ldr(obj_reg, Address(lock_reg, BasicObjectLock::obj_offset_in_bytes()));
+
+ // Free entry
+ str(zr, Address(lock_reg, BasicObjectLock::obj_offset_in_bytes()));
+
+ if (UseBiasedLocking) {
+ biased_locking_exit(obj_reg, header_reg, done);
+ }
+
+ // Load the old header from BasicLock structure
+ ldr(header_reg, Address(swap_reg,
+ BasicLock::displaced_header_offset_in_bytes()));
+
+ // Test for recursion
+ cbz(header_reg, done);
+
+ // Atomic swap back the old header
+ cmpxchgptr(swap_reg, header_reg, obj_reg, rscratch1, done, /*fallthrough*/NULL);
+
+ // Call the runtime routine for slow case.
+ str(obj_reg, Address(lock_reg, BasicObjectLock::obj_offset_in_bytes())); // restore obj
+ call_VM(noreg,
+ CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorexit),
+ lock_reg);
+
+ bind(done);
+
+ restore_bcp();
+ }
+}
+
+#ifndef CC_INTERP
+
+void InterpreterMacroAssembler::test_method_data_pointer(Register mdp,
+ Label& zero_continue) {
+ assert(ProfileInterpreter, "must be profiling interpreter");
+ ldr(mdp, Address(rfp, frame::interpreter_frame_mdp_offset * wordSize));
+ cbz(mdp, zero_continue);
+}
+
+// Set the method data pointer for the current bcp.
+void InterpreterMacroAssembler::set_method_data_pointer_for_bcp() {
+ assert(ProfileInterpreter, "must be profiling interpreter");
+ Label set_mdp;
+ stp(r0, r1, Address(pre(sp, -2 * wordSize)));
+
+ // Test MDO to avoid the call if it is NULL.
+ ldr(r0, Address(rmethod, in_bytes(Method::method_data_offset())));
+ cbz(r0, set_mdp);
+ call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::bcp_to_di), rmethod, rbcp);
+ // r0: mdi
+ // mdo is guaranteed to be non-zero here, we checked for it before the call.
+ ldr(r1, Address(rmethod, in_bytes(Method::method_data_offset())));
+ lea(r1, Address(r1, in_bytes(MethodData::data_offset())));
+ add(r0, r1, r0);
+ str(r0, Address(rfp, frame::interpreter_frame_mdp_offset * wordSize));
+ bind(set_mdp);
+ ldp(r0, r1, Address(post(sp, 2 * wordSize)));
+}
+
+void InterpreterMacroAssembler::verify_method_data_pointer() {
+ assert(ProfileInterpreter, "must be profiling interpreter");
+#ifdef ASSERT
+ Label verify_continue;
+ stp(r0, r1, Address(pre(sp, -2 * wordSize)));
+ stp(r2, r3, Address(pre(sp, -2 * wordSize)));
+ test_method_data_pointer(r3, verify_continue); // If mdp is zero, continue
+ get_method(r1);
+
+ // If the mdp is valid, it will point to a DataLayout header which is
+ // consistent with the bcp. The converse is highly probable also.
+ ldrsh(r2, Address(r3, in_bytes(DataLayout::bci_offset())));
+ ldr(rscratch1, Address(r1, Method::const_offset()));
+ add(r2, r2, rscratch1, Assembler::LSL);
+ lea(r2, Address(r2, ConstMethod::codes_offset()));
+ cmp(r2, rbcp);
+ br(Assembler::EQ, verify_continue);
+ // r1: method
+ // rbcp: bcp // rbcp == 22
+ // r3: mdp
+ call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::verify_mdp),
+ r1, rbcp, r3);
+ bind(verify_continue);
+ ldp(r2, r3, Address(post(sp, 2 * wordSize)));
+ ldp(r0, r1, Address(post(sp, 2 * wordSize)));
+#endif // ASSERT
+}
+
+
+void InterpreterMacroAssembler::set_mdp_data_at(Register mdp_in,
+ int constant,
+ Register value) {
+ assert(ProfileInterpreter, "must be profiling interpreter");
+ Address data(mdp_in, constant);
+ str(value, data);
+}
+
+
+void InterpreterMacroAssembler::increment_mdp_data_at(Register mdp_in,
+ int constant,
+ bool decrement) {
+ increment_mdp_data_at(mdp_in, noreg, constant, decrement);
+}
+
+void InterpreterMacroAssembler::increment_mdp_data_at(Register mdp_in,
+ Register reg,
+ int constant,
+ bool decrement) {
+ assert(ProfileInterpreter, "must be profiling interpreter");
+ // %%% this does 64-bit counters; at best it is wasting space,
+ // at worst it is a rare bug when counters overflow
+
+ assert_different_registers(rscratch2, rscratch1, mdp_in, reg);
+
+ Address addr1(mdp_in, constant);
+ Address addr2(rscratch2, reg, Address::lsl(0));
+ Address &addr = addr1;
+ if (reg != noreg) {
+ lea(rscratch2, addr1);
+ addr = addr2;
+ }
+
+ if (decrement) {
+ // Decrement the register. Set condition codes.
+ // Intel does this
+ // addptr(data, (int32_t) -DataLayout::counter_increment);
+ // If the decrement causes the counter to overflow, stay negative
+ // Label L;
+ // jcc(Assembler::negative, L);
+ // addptr(data, (int32_t) DataLayout::counter_increment);
+ // so we do this
+ ldr(rscratch1, addr);
+ subs(rscratch1, rscratch1, (unsigned)DataLayout::counter_increment);
+ Label L;
+ br(Assembler::LO, L); // skip store if counter underflow
+ str(rscratch1, addr);
+ bind(L);
+ } else {
+ assert(DataLayout::counter_increment == 1,
+ "flow-free idiom only works with 1");
+ // Intel does this
+ // Increment the register. Set carry flag.
+ // addptr(data, DataLayout::counter_increment);
+ // If the increment causes the counter to overflow, pull back by 1.
+ // sbbptr(data, (int32_t)0);
+ // so we do this
+ ldr(rscratch1, addr);
+ adds(rscratch1, rscratch1, DataLayout::counter_increment);
+ Label L;
+ br(Assembler::CS, L); // skip store if counter overflow
+ str(rscratch1, addr);
+ bind(L);
+ }
+}
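+
+// A rough C sketch of the idiom above (assuming counter_increment == 1,
+// as the assert in the increment case enforces):
+//   uint64_t c = *addr;
+//   if (decrement) { if (c != 0)     *addr = c - 1; }   // stick at zero
+//   else           { if (c + 1 != 0) *addr = c + 1; }   // stick at max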
+
+void InterpreterMacroAssembler::set_mdp_flag_at(Register mdp_in,
+ int flag_byte_constant) {
+ assert(ProfileInterpreter, "must be profiling interpreter");
+ int header_offset = in_bytes(DataLayout::header_offset());
+ int header_bits = DataLayout::flag_mask_to_header_mask(flag_byte_constant);
+ // Set the flag
+ ldr(rscratch1, Address(mdp_in, header_offset));
+ orr(rscratch1, rscratch1, header_bits);
+ str(rscratch1, Address(mdp_in, header_offset));
+}
+
+
+void InterpreterMacroAssembler::test_mdp_data_at(Register mdp_in,
+ int offset,
+ Register value,
+ Register test_value_out,
+ Label& not_equal_continue) {
+ assert(ProfileInterpreter, "must be profiling interpreter");
+ if (test_value_out == noreg) {
+ ldr(rscratch1, Address(mdp_in, offset));
+ cmp(value, rscratch1);
+ } else {
+ // Put the test value into a register, so caller can use it:
+ ldr(test_value_out, Address(mdp_in, offset));
+ cmp(value, test_value_out);
+ }
+ br(Assembler::NE, not_equal_continue);
+}
+
+
+void InterpreterMacroAssembler::update_mdp_by_offset(Register mdp_in,
+ int offset_of_disp) {
+ assert(ProfileInterpreter, "must be profiling interpreter");
+ ldr(rscratch1, Address(mdp_in, offset_of_disp));
+ add(mdp_in, mdp_in, rscratch1, LSL);
+ str(mdp_in, Address(rfp, frame::interpreter_frame_mdp_offset * wordSize));
+}
+
+
+void InterpreterMacroAssembler::update_mdp_by_offset(Register mdp_in,
+ Register reg,
+ int offset_of_disp) {
+ assert(ProfileInterpreter, "must be profiling interpreter");
+ lea(rscratch1, Address(mdp_in, offset_of_disp));
+ ldr(rscratch1, Address(rscratch1, reg, Address::lsl(0)));
+ add(mdp_in, mdp_in, rscratch1, LSL);
+ str(mdp_in, Address(rfp, frame::interpreter_frame_mdp_offset * wordSize));
+}
+
+
+void InterpreterMacroAssembler::update_mdp_by_constant(Register mdp_in,
+ int constant) {
+ assert(ProfileInterpreter, "must be profiling interpreter");
+ add(mdp_in, mdp_in, (unsigned)constant);
+ str(mdp_in, Address(rfp, frame::interpreter_frame_mdp_offset * wordSize));
+}
+
+
+void InterpreterMacroAssembler::update_mdp_for_ret(Register return_bci) {
+ assert(ProfileInterpreter, "must be profiling interpreter");
+ // save/restore across call_VM
+ stp(zr, return_bci, Address(pre(sp, -2 * wordSize)));
+ call_VM(noreg,
+ CAST_FROM_FN_PTR(address, InterpreterRuntime::update_mdp_for_ret),
+ return_bci);
+ ldp(zr, return_bci, Address(post(sp, 2 * wordSize)));
+}
+
+
+void InterpreterMacroAssembler::profile_taken_branch(Register mdp,
+ Register bumped_count) {
+ if (ProfileInterpreter) {
+ Label profile_continue;
+
+ // If no method data exists, go to profile_continue.
+ // Otherwise, assign to mdp
+ test_method_data_pointer(mdp, profile_continue);
+
+ // We are taking a branch. Increment the taken count.
+ // We inline increment_mdp_data_at to return bumped_count in a register
+ //increment_mdp_data_at(mdp, in_bytes(JumpData::taken_offset()));
+ Address data(mdp, in_bytes(JumpData::taken_offset()));
+ ldr(bumped_count, data);
+ assert(DataLayout::counter_increment == 1,
+ "flow-free idiom only works with 1");
+ // Intel does this to catch overflow
+ // addptr(bumped_count, DataLayout::counter_increment);
+ // sbbptr(bumped_count, 0);
+ // so we do this
+ adds(bumped_count, bumped_count, DataLayout::counter_increment);
+ Label L;
+ br(Assembler::CS, L); // skip store if counter overflow
+ str(bumped_count, data);
+ bind(L);
+ // The method data pointer needs to be updated to reflect the new target.
+ update_mdp_by_offset(mdp, in_bytes(JumpData::displacement_offset()));
+ bind(profile_continue);
+ }
+}
+
+
+void InterpreterMacroAssembler::profile_not_taken_branch(Register mdp) {
+ if (ProfileInterpreter) {
+ Label profile_continue;
+
+ // If no method data exists, go to profile_continue.
+ test_method_data_pointer(mdp, profile_continue);
+
+ // We are taking a branch. Increment the not taken count.
+ increment_mdp_data_at(mdp, in_bytes(BranchData::not_taken_offset()));
+
+ // The method data pointer needs to be updated to correspond to
+ // the next bytecode
+ update_mdp_by_constant(mdp, in_bytes(BranchData::branch_data_size()));
+ bind(profile_continue);
+ }
+}
+
+
+void InterpreterMacroAssembler::profile_call(Register mdp) {
+ if (ProfileInterpreter) {
+ Label profile_continue;
+
+ // If no method data exists, go to profile_continue.
+ test_method_data_pointer(mdp, profile_continue);
+
+ // We are making a call. Increment the count.
+ increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset()));
+
+ // The method data pointer needs to be updated to reflect the new target.
+ update_mdp_by_constant(mdp, in_bytes(CounterData::counter_data_size()));
+ bind(profile_continue);
+ }
+}
+
+void InterpreterMacroAssembler::profile_final_call(Register mdp) {
+ if (ProfileInterpreter) {
+ Label profile_continue;
+
+ // If no method data exists, go to profile_continue.
+ test_method_data_pointer(mdp, profile_continue);
+
+ // We are making a call. Increment the count.
+ increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset()));
+
+ // The method data pointer needs to be updated to reflect the new target.
+ update_mdp_by_constant(mdp,
+ in_bytes(VirtualCallData::
+ virtual_call_data_size()));
+ bind(profile_continue);
+ }
+}
+
+
+void InterpreterMacroAssembler::profile_virtual_call(Register receiver,
+ Register mdp,
+ Register reg2,
+ bool receiver_can_be_null) {
+ if (ProfileInterpreter) {
+ Label profile_continue;
+
+ // If no method data exists, go to profile_continue.
+ test_method_data_pointer(mdp, profile_continue);
+
+ Label skip_receiver_profile;
+ if (receiver_can_be_null) {
+ Label not_null;
+ // We are making a call. Increment the count for null receiver.
+ increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset()));
+ b(skip_receiver_profile);
+ bind(not_null);
+ }
+
+ // Record the receiver type.
+ record_klass_in_profile(receiver, mdp, reg2, true);
+ bind(skip_receiver_profile);
+
+ // The method data pointer needs to be updated to reflect the new target.
+ update_mdp_by_constant(mdp,
+ in_bytes(VirtualCallData::
+ virtual_call_data_size()));
+ bind(profile_continue);
+ }
+}
+
+// This routine creates a state machine for updating the multi-row
+// type profile at a virtual call site (or other type-sensitive bytecode).
+// The machine visits each row (of receiver/count) until the receiver type
+// is found, or until it runs out of rows. At the same time, it remembers
+// the location of the first empty row. (An empty row records null for its
+// receiver, and can be allocated for a newly-observed receiver type.)
+// Because there are two degrees of freedom in the state, a simple linear
+// search will not work; it must be a decision tree. Hence this helper
+// function is recursive, to generate the required tree structured code.
+// It's the interpreter, so we are trading off code space for speed.
+// See below for example code.
+void InterpreterMacroAssembler::record_klass_in_profile_helper(
+ Register receiver, Register mdp,
+ Register reg2, int start_row,
+ Label& done, bool is_virtual_call) {
+ if (TypeProfileWidth == 0) {
+ if (is_virtual_call) {
+ increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset()));
+ }
+ return;
+ }
+
+ int last_row = VirtualCallData::row_limit() - 1;
+ assert(start_row <= last_row, "must be work left to do");
+ // Test this row for both the receiver and for null.
+ // Take any of three different outcomes:
+ // 1. found receiver => increment count and goto done
+ // 2. found null => keep looking for case 1, maybe allocate this cell
+ // 3. found something else => keep looking for cases 1 and 2
+ // Case 3 is handled by a recursive call.
+ for (int row = start_row; row <= last_row; row++) {
+ Label next_test;
+ bool test_for_null_also = (row == start_row);
+
+ // See if the receiver is receiver[n].
+ int recvr_offset = in_bytes(VirtualCallData::receiver_offset(row));
+ test_mdp_data_at(mdp, recvr_offset, receiver,
+ (test_for_null_also ? reg2 : noreg),
+ next_test);
+ // (Reg2 now contains the receiver from the CallData.)
+
+ // The receiver is receiver[n]. Increment count[n].
+ int count_offset = in_bytes(VirtualCallData::receiver_count_offset(row));
+ increment_mdp_data_at(mdp, count_offset);
+ b(done);
+ bind(next_test);
+
+ if (test_for_null_also) {
+ Label found_null;
+ // Failed the equality check on receiver[n]... Test for null.
+ if (start_row == last_row) {
+ // The only thing left to do is handle the null case.
+ if (is_virtual_call) {
+ cbz(reg2, found_null);
+ // Receiver did not match any saved receiver and there is no empty row for it.
+ // Increment total counter to indicate polymorphic case.
+ increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset()));
+ b(done);
+ bind(found_null);
+ } else {
+ cbz(reg2, done);
+ }
+ break;
+ }
+ // Since null is rare, make it be the branch-taken case.
+ cbz(reg2, found_null);
+
+ // Put all the "Case 3" tests here.
+ record_klass_in_profile_helper(receiver, mdp, reg2, start_row + 1, done, is_virtual_call);
+
+ // Found a null. Keep searching for a matching receiver,
+ // but remember that this is an empty (unused) slot.
+ bind(found_null);
+ }
+ }
+
+ // In the fall-through case, we found no matching receiver, but we
+ // observed that receiver[start_row] is NULL.
+
+ // Fill in the receiver field and increment the count.
+ int recvr_offset = in_bytes(VirtualCallData::receiver_offset(start_row));
+ set_mdp_data_at(mdp, recvr_offset, receiver);
+ int count_offset = in_bytes(VirtualCallData::receiver_count_offset(start_row));
+ mov(reg2, DataLayout::counter_increment);
+ set_mdp_data_at(mdp, count_offset, reg2);
+ if (start_row > 0) {
+ b(done);
+ }
+}
+
+// Example state machine code for three profile rows:
+// // main copy of decision tree, rooted at row[0]
+// if (row[0].rec == rec) { row[0].incr(); goto done; }
+// if (row[0].rec != NULL) {
+// // inner copy of decision tree, rooted at row[1]
+// if (row[1].rec == rec) { row[1].incr(); goto done; }
+// if (row[1].rec != NULL) {
+// // degenerate decision tree, rooted at row[2]
+// if (row[2].rec == rec) { row[2].incr(); goto done; }
+// if (row[2].rec != NULL) { count.incr(); goto done; } // overflow
+// row[2].init(rec); goto done;
+// } else {
+// // remember row[1] is empty
+// if (row[2].rec == rec) { row[2].incr(); goto done; }
+// row[1].init(rec); goto done;
+// }
+// } else {
+// // remember row[0] is empty
+// if (row[1].rec == rec) { row[1].incr(); goto done; }
+// if (row[2].rec == rec) { row[2].incr(); goto done; }
+// row[0].init(rec); goto done;
+// }
+// done:
+
+void InterpreterMacroAssembler::record_klass_in_profile(Register receiver,
+ Register mdp, Register reg2,
+ bool is_virtual_call) {
+ assert(ProfileInterpreter, "must be profiling");
+ Label done;
+
+ record_klass_in_profile_helper(receiver, mdp, reg2, 0, done, is_virtual_call);
+
+ bind (done);
+}
+
+void InterpreterMacroAssembler::profile_ret(Register return_bci,
+ Register mdp) {
+ if (ProfileInterpreter) {
+ Label profile_continue;
+ uint row;
+
+ // If no method data exists, go to profile_continue.
+ test_method_data_pointer(mdp, profile_continue);
+
+ // Update the total ret count.
+ increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset()));
+
+ for (row = 0; row < RetData::row_limit(); row++) {
+ Label next_test;
+
+ // See if return_bci is equal to bci[n]:
+ test_mdp_data_at(mdp,
+ in_bytes(RetData::bci_offset(row)),
+ return_bci, noreg,
+ next_test);
+
+ // return_bci is equal to bci[n]. Increment the count.
+ increment_mdp_data_at(mdp, in_bytes(RetData::bci_count_offset(row)));
+
+ // The method data pointer needs to be updated to reflect the new target.
+ update_mdp_by_offset(mdp,
+ in_bytes(RetData::bci_displacement_offset(row)));
+ b(profile_continue);
+ bind(next_test);
+ }
+
+ update_mdp_for_ret(return_bci);
+
+ bind(profile_continue);
+ }
+}
+
+void InterpreterMacroAssembler::profile_null_seen(Register mdp) {
+ if (ProfileInterpreter) {
+ Label profile_continue;
+
+ // If no method data exists, go to profile_continue.
+ test_method_data_pointer(mdp, profile_continue);
+
+ set_mdp_flag_at(mdp, BitData::null_seen_byte_constant());
+
+ // The method data pointer needs to be updated.
+ int mdp_delta = in_bytes(BitData::bit_data_size());
+ if (TypeProfileCasts) {
+ mdp_delta = in_bytes(VirtualCallData::virtual_call_data_size());
+ }
+ update_mdp_by_constant(mdp, mdp_delta);
+
+ bind(profile_continue);
+ }
+}
+
+void InterpreterMacroAssembler::profile_typecheck_failed(Register mdp) {
+ if (ProfileInterpreter && TypeProfileCasts) {
+ Label profile_continue;
+
+ // If no method data exists, go to profile_continue.
+ test_method_data_pointer(mdp, profile_continue);
+
+ int count_offset = in_bytes(CounterData::count_offset());
+ // Back up the address, since we have already bumped the mdp.
+ count_offset -= in_bytes(VirtualCallData::virtual_call_data_size());
+
+ // *Decrement* the counter. We expect to see zero or small negatives.
+ increment_mdp_data_at(mdp, count_offset, true);
+
+ bind (profile_continue);
+ }
+}
+
+void InterpreterMacroAssembler::profile_typecheck(Register mdp, Register klass, Register reg2) {
+ if (ProfileInterpreter) {
+ Label profile_continue;
+
+ // If no method data exists, go to profile_continue.
+ test_method_data_pointer(mdp, profile_continue);
+
+ // The method data pointer needs to be updated.
+ int mdp_delta = in_bytes(BitData::bit_data_size());
+ if (TypeProfileCasts) {
+ mdp_delta = in_bytes(VirtualCallData::virtual_call_data_size());
+
+ // Record the object type.
+ record_klass_in_profile(klass, mdp, reg2, false);
+ }
+ update_mdp_by_constant(mdp, mdp_delta);
+
+ bind(profile_continue);
+ }
+}
+
+void InterpreterMacroAssembler::profile_switch_default(Register mdp) {
+ if (ProfileInterpreter) {
+ Label profile_continue;
+
+ // If no method data exists, go to profile_continue.
+ test_method_data_pointer(mdp, profile_continue);
+
+ // Update the default case count
+ increment_mdp_data_at(mdp,
+ in_bytes(MultiBranchData::default_count_offset()));
+
+ // The method data pointer needs to be updated.
+ update_mdp_by_offset(mdp,
+ in_bytes(MultiBranchData::
+ default_displacement_offset()));
+
+ bind(profile_continue);
+ }
+}
+
+void InterpreterMacroAssembler::profile_switch_case(Register index,
+ Register mdp,
+ Register reg2) {
+ if (ProfileInterpreter) {
+ Label profile_continue;
+
+ // If no method data exists, go to profile_continue.
+ test_method_data_pointer(mdp, profile_continue);
+
+ // Build the base (index * per_case_size_in_bytes()) +
+ // case_array_offset_in_bytes()
+ movw(reg2, in_bytes(MultiBranchData::per_case_size()));
+ movw(rscratch1, in_bytes(MultiBranchData::case_array_offset()));
+ maddw(index, index, reg2, rscratch1);
+
+ // Update the case count
+ increment_mdp_data_at(mdp,
+ index,
+ in_bytes(MultiBranchData::relative_count_offset()));
+
+ // The method data pointer needs to be updated.
+ update_mdp_by_offset(mdp,
+ index,
+ in_bytes(MultiBranchData::
+ relative_displacement_offset()));
+
+ bind(profile_continue);
+ }
+}
+
+void InterpreterMacroAssembler::verify_oop(Register reg, TosState state) {
+ if (state == atos) {
+ MacroAssembler::verify_oop(reg);
+ }
+}
+
+void InterpreterMacroAssembler::verify_FPU(int stack_depth, TosState state) { ; }
+#endif // !CC_INTERP
+
+
+void InterpreterMacroAssembler::notify_method_entry() {
+ // Whenever JVMTI is interp_only_mode, method entry/exit events are sent to
+ // track stack depth. If it is possible to enter interp_only_mode we add
+ // the code to check if the event should be sent.
+ if (JvmtiExport::can_post_interpreter_events()) {
+ Label L;
+ ldr(r3, Address(rthread, JavaThread::interp_only_mode_offset()));
+ tst(r3, ~0);
+ br(Assembler::EQ, L);
+ call_VM(noreg, CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::post_method_entry));
+ bind(L);
+ }
+
+ {
+ SkipIfEqual skip(this, &DTraceMethodProbes, false);
+ get_method(c_rarg1);
+ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
+ rthread, c_rarg1);
+ }
+
+ // RedefineClasses() tracing support for obsolete method entry
+ if (RC_TRACE_IN_RANGE(0x00001000, 0x00002000)) {
+ get_method(c_rarg1);
+ call_VM_leaf(
+ CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
+ rthread, c_rarg1);
+ }
+
+ }
+
+
+void InterpreterMacroAssembler::notify_method_exit(
+ TosState state, NotifyMethodExitMode mode) {
+ // Whenever JVMTI is interp_only_mode, method entry/exit events are sent to
+ // track stack depth. If it is possible to enter interp_only_mode we add
+ // the code to check if the event should be sent.
+ if (mode == NotifyJVMTI && JvmtiExport::can_post_interpreter_events()) {
+ Label L;
+ // Note: frame::interpreter_frame_result has a dependency on how the
+ // method result is saved across the call to post_method_exit. If this
+ // is changed then the interpreter_frame_result implementation will
+ // need to be updated too.
+
+ // For c++ interpreter the result is always stored at a known location in the frame
+ // template interpreter will leave it on the top of the stack.
+ NOT_CC_INTERP(push(state);)
+ ldrw(r3, Address(rthread, JavaThread::interp_only_mode_offset()));
+ cbz(r3, L);
+ call_VM(noreg,
+ CAST_FROM_FN_PTR(address, InterpreterRuntime::post_method_exit));
+ bind(L);
+ NOT_CC_INTERP(pop(state));
+ }
+
+ {
+ SkipIfEqual skip(this, &DTraceMethodProbes, false);
+ NOT_CC_INTERP(push(state));
+ get_method(c_rarg1);
+ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
+ rthread, c_rarg1);
+ NOT_CC_INTERP(pop(state));
+ }
+}
+
+
+// Jump if ((*counter_addr += increment) & mask) satisfies the condition.
+void InterpreterMacroAssembler::increment_mask_and_jump(Address counter_addr,
+ int increment, int mask,
+ Register scratch, bool preloaded,
+ Condition cond, Label* where) {
+ if (!preloaded) {
+ ldrw(scratch, counter_addr);
+ }
+ add(scratch, scratch, increment);
+ strw(scratch, counter_addr);
+ ands(scratch, scratch, mask);
+ br(cond, *where);
+}
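+
+// Typical use (an illustrative sketch, not a specific call site): with
+// increment == 1 and mask == (1 << k) - 1, passing cond == EQ makes the
+// branch fire once every 2^k bumps, i.e. whenever the masked count wraps
+// back to zero.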
+
+void InterpreterMacroAssembler::call_VM_leaf_base(address entry_point,
+ int number_of_arguments) {
+ // interpreter specific
+ //
+ // Note: No need to save/restore rbcp & rlocals pointer since these
+ // are callee-saved registers and no blocking/GC can happen
+ // in leaf calls.
+#ifdef ASSERT
+ {
+ Label L;
+ ldr(rscratch1, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
+ cbz(rscratch1, L);
+ stop("InterpreterMacroAssembler::call_VM_leaf_base:"
+ " last_sp != NULL");
+ bind(L);
+ }
+#endif /* ASSERT */
+ // super call
+ MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);
+}
+
+void InterpreterMacroAssembler::call_VM_base(Register oop_result,
+ Register java_thread,
+ Register last_java_sp,
+ address entry_point,
+ int number_of_arguments,
+ bool check_exceptions) {
+ // interpreter specific
+ //
+ // Note: Could avoid restoring locals ptr (callee saved) - however doesn't
+ // really make a difference for these runtime calls, since they are
+ // slow anyway. Btw., bcp must be saved/restored since it may change
+ // due to GC.
+ // assert(java_thread == noreg , "not expecting a precomputed java thread");
+ save_bcp();
+#ifdef ASSERT
+ {
+ Label L;
+ ldr(rscratch1, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
+ cbz(rscratch1, L);
+ stop("InterpreterMacroAssembler::call_VM_leaf_base:"
+ " last_sp != NULL");
+ bind(L);
+ }
+#endif /* ASSERT */
+ // super call
+ MacroAssembler::call_VM_base(oop_result, noreg, last_java_sp,
+ entry_point, number_of_arguments,
+ check_exceptions);
+// interpreter specific
+ restore_bcp();
+ restore_locals();
+}
+
+void InterpreterMacroAssembler::profile_obj_type(Register obj, const Address& mdo_addr) {
+ Label update, next, none;
+
+ verify_oop(obj);
+
+ cbnz(obj, update);
+ orptr(mdo_addr, TypeEntries::null_seen);
+ b(next);
+
+ bind(update);
+ load_klass(obj, obj);
+
+ ldr(rscratch1, mdo_addr);
+ eor(obj, obj, rscratch1);
+ tst(obj, TypeEntries::type_klass_mask);
+ br(Assembler::EQ, next); // klass seen before, nothing to
+ // do. The unknown bit may have been
+ // set already but no need to check.
+
+ tst(obj, TypeEntries::type_unknown);
+ br(Assembler::NE, next); // already unknown. Nothing to do anymore.
+
+ ldr(rscratch1, mdo_addr);
+ cbz(rscratch1, none);
+ cmp(rscratch1, TypeEntries::null_seen);
+ br(Assembler::EQ, none);
+ // There is a chance that the checks above (re-reading profiling
+ // data from memory) fail if another thread has just set the
+ // profiled type to this obj's klass
+ ldr(rscratch1, mdo_addr);
+ eor(obj, obj, rscratch1);
+ tst(obj, TypeEntries::type_klass_mask);
+ br(Assembler::EQ, next);
+
+ // different than before. Cannot keep accurate profile.
+ orptr(mdo_addr, TypeEntries::type_unknown);
+ b(next);
+
+ bind(none);
+ // first time here. Set profile type.
+ str(obj, mdo_addr);
+
+ bind(next);
+}
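+
+// Rough sketch of the transitions performed above (flag-bit details
+// elided; null_seen and type_unknown are the TypeEntries flags):
+//   empty cell            -- null obj -->      null_seen
+//   empty or null_seen    -- klass K  -->      K recorded
+//   K recorded            -- same K   -->      unchanged
+//   K recorded            -- other klass -->   type_unknown (polymorphic)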
+
+void InterpreterMacroAssembler::profile_arguments_type(Register mdp, Register callee, Register tmp, bool is_virtual) {
+ if (!ProfileInterpreter) {
+ return;
+ }
+
+ if (MethodData::profile_arguments() || MethodData::profile_return()) {
+ Label profile_continue;
+
+ test_method_data_pointer(mdp, profile_continue);
+
+ int off_to_start = is_virtual ? in_bytes(VirtualCallData::virtual_call_data_size()) : in_bytes(CounterData::counter_data_size());
+
+ ldrb(rscratch1, Address(mdp, in_bytes(DataLayout::tag_offset()) - off_to_start));
+ cmp(rscratch1, is_virtual ? DataLayout::virtual_call_type_data_tag : DataLayout::call_type_data_tag);
+ br(Assembler::NE, profile_continue);
+
+ if (MethodData::profile_arguments()) {
+ Label done;
+ int off_to_args = in_bytes(TypeEntriesAtCall::args_data_offset());
+ add(mdp, mdp, off_to_args);
+
+ for (int i = 0; i < TypeProfileArgsLimit; i++) {
+ if (i > 0 || MethodData::profile_return()) {
+ // If return value type is profiled we may have no argument to profile
+ ldr(tmp, Address(mdp, in_bytes(TypeEntriesAtCall::cell_count_offset())-off_to_args));
+ sub(tmp, tmp, i*TypeStackSlotEntries::per_arg_count());
+ cmp(tmp, TypeStackSlotEntries::per_arg_count());
+ br(Assembler::LT, done);
+ }
+ ldr(tmp, Address(callee, Method::const_offset()));
+ load_unsigned_short(tmp, Address(tmp, ConstMethod::size_of_parameters_offset()));
+ // stack offset o (zero based) from the start of the argument
+ // list, for n arguments translates into offset n - o - 1 from
+ // the end of the argument list
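+ // Worked example (illustrative numbers only): with n == 3 parameter
+ // slots and a recorded stack offset o == 0, the subtractions below
+ // yield 3 - 0 - 1 == 2 as the index passed to argument_address().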
+ ldr(rscratch1, Address(mdp, in_bytes(TypeEntriesAtCall::stack_slot_offset(i))-off_to_args));
+ sub(tmp, tmp, rscratch1);
+ sub(tmp, tmp, 1);
+ Address arg_addr = argument_address(tmp);
+ ldr(tmp, arg_addr);
+
+ Address mdo_arg_addr(mdp, in_bytes(TypeEntriesAtCall::argument_type_offset(i))-off_to_args);
+ profile_obj_type(tmp, mdo_arg_addr);
+
+ int to_add = in_bytes(TypeStackSlotEntries::per_arg_size());
+ add(mdp, mdp, to_add);
+ off_to_args += to_add;
+ }
+
+ if (MethodData::profile_return()) {
+ ldr(tmp, Address(mdp, in_bytes(TypeEntriesAtCall::cell_count_offset())-off_to_args));
+ sub(tmp, tmp, TypeProfileArgsLimit*TypeStackSlotEntries::per_arg_count());
+ }
+
+ bind(done);
+
+ if (MethodData::profile_return()) {
+ // We're right after the type profile for the last
+ // argument. tmp is the number of cells left in the
+ // CallTypeData/VirtualCallTypeData to reach its end. Non null
+ // if there's a return to profile.
+ assert(ReturnTypeEntry::static_cell_count() < TypeStackSlotEntries::per_arg_count(), "can't move past ret type");
+ add(mdp, mdp, tmp, LSL, exact_log2(DataLayout::cell_size));
+ }
+ str(mdp, Address(rfp, frame::interpreter_frame_mdp_offset * wordSize));
+ } else {
+ assert(MethodData::profile_return(), "either profile call args or call ret");
+ update_mdp_by_constant(mdp, in_bytes(ReturnTypeEntry::size()));
+ }
+
+ // mdp points right after the end of the
+ // CallTypeData/VirtualCallTypeData, right after the cells for the
+ // return value type if there's one
+
+ bind(profile_continue);
+ }
+}
+
+void InterpreterMacroAssembler::profile_return_type(Register mdp, Register ret, Register tmp) {
+ assert_different_registers(mdp, ret, tmp, rbcp);
+ if (ProfileInterpreter && MethodData::profile_return()) {
+ Label profile_continue, done;
+
+ test_method_data_pointer(mdp, profile_continue);
+
+ if (MethodData::profile_return_jsr292_only()) {
+ // If we don't profile all invoke bytecodes we must make sure
+ // it's a bytecode we indeed profile. We can't go back to the
+ // beginning of the ProfileData we intend to update to check its
+ // type because we're right after it and we don't know its
+ // length
+ Label do_profile;
+ ldrb(rscratch1, Address(rbcp, 0));
+ cmp(rscratch1, Bytecodes::_invokedynamic);
+ br(Assembler::EQ, do_profile);
+ cmp(rscratch1, Bytecodes::_invokehandle);
+ br(Assembler::EQ, do_profile);
+ get_method(tmp);
+ ldrb(rscratch1, Address(tmp, Method::intrinsic_id_offset_in_bytes()));
+ cmp(rscratch1, vmIntrinsics::_compiledLambdaForm);
+ br(Assembler::NE, profile_continue);
+
+ bind(do_profile);
+ }
+
+ Address mdo_ret_addr(mdp, -in_bytes(ReturnTypeEntry::size()));
+ mov(tmp, ret);
+ profile_obj_type(tmp, mdo_ret_addr);
+
+ bind(profile_continue);
+ }
+}
+
+void InterpreterMacroAssembler::profile_parameters_type(Register mdp, Register tmp1, Register tmp2) {
+ if (ProfileInterpreter && MethodData::profile_parameters()) {
+ Label profile_continue, done;
+
+ test_method_data_pointer(mdp, profile_continue);
+
+ // Load the offset of the area within the MDO used for
+ // parameters. If it's negative we're not profiling any parameters
+ ldr(tmp1, Address(mdp, in_bytes(MethodData::parameters_type_data_di_offset()) - in_bytes(MethodData::data_offset())));
+ cmp(tmp1, 0u);
+ br(Assembler::LT, profile_continue);
+
+ // Compute a pointer to the area for parameters from the offset
+ // and move the pointer to the slot for the last
+ // parameters. Collect profiling from last parameter down.
+ // mdo start + parameters offset + array length - 1
+ add(mdp, mdp, tmp1);
+ ldr(tmp1, Address(mdp, ArrayData::array_len_offset()));
+ sub(tmp1, tmp1, TypeStackSlotEntries::per_arg_count());
+
+ Label loop;
+ bind(loop);
+
+ int off_base = in_bytes(ParametersTypeData::stack_slot_offset(0));
+ int type_base = in_bytes(ParametersTypeData::type_offset(0));
+ int per_arg_scale = exact_log2(DataLayout::cell_size);
+ add(rscratch1, mdp, off_base);
+ add(rscratch2, mdp, type_base);
+
+ Address arg_off(rscratch1, tmp1, Address::lsl(per_arg_scale));
+ Address arg_type(rscratch2, tmp1, Address::lsl(per_arg_scale));
+
+ // load offset on the stack from the slot for this parameter
+ ldr(tmp2, arg_off);
+ neg(tmp2, tmp2);
+ // read the parameter from the local area
+ ldr(tmp2, Address(rlocals, tmp2, Address::lsl(Interpreter::logStackElementSize)));
+
+ // profile the parameter
+ profile_obj_type(tmp2, arg_type);
+
+ // go to next parameter
+ subs(tmp1, tmp1, TypeStackSlotEntries::per_arg_count());
+ br(Assembler::GE, loop);
+
+ bind(profile_continue);
+ }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/interp_masm_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,291 @@
+/*
+ * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_INTERP_MASM_AARCH64_64_HPP
+#define CPU_AARCH64_VM_INTERP_MASM_AARCH64_64_HPP
+
+#include "asm/macroAssembler.hpp"
+#include "asm/macroAssembler.inline.hpp"
+#include "interpreter/invocationCounter.hpp"
+#include "runtime/frame.hpp"
+
+// This file specializes the assembler with interpreter-specific macros
+
+
+class InterpreterMacroAssembler: public MacroAssembler {
+#ifndef CC_INTERP
+ protected:
+ // Interpreter specific version of call_VM_base
+ virtual void call_VM_leaf_base(address entry_point,
+ int number_of_arguments);
+
+ virtual void call_VM_base(Register oop_result,
+ Register java_thread,
+ Register last_java_sp,
+ address entry_point,
+ int number_of_arguments,
+ bool check_exceptions);
+
+ virtual void check_and_handle_popframe(Register java_thread);
+ virtual void check_and_handle_earlyret(Register java_thread);
+
+ // base routine for all dispatches
+ void dispatch_base(TosState state, address* table, bool verifyoop = true);
+#endif // CC_INTERP
+
+ public:
+ InterpreterMacroAssembler(CodeBuffer* code) : MacroAssembler(code) {}
+
+ void load_earlyret_value(TosState state);
+
+#ifdef CC_INTERP
+ void save_bcp() { /* not needed in c++ interpreter and harmless */ }
+ void restore_bcp() { /* not needed in c++ interpreter and harmless */ }
+
+ // Helpers for runtime call arguments/results
+ void get_method(Register reg);
+
+#else
+
+ // Interpreter-specific registers
+ void save_bcp() {
+ str(rbcp, Address(rfp, frame::interpreter_frame_bcp_offset * wordSize));
+ }
+
+ void restore_bcp() {
+ ldr(rbcp, Address(rfp, frame::interpreter_frame_bcp_offset * wordSize));
+ }
+
+ void restore_locals() {
+ ldr(rlocals, Address(rfp, frame::interpreter_frame_locals_offset * wordSize));
+ }
+
+ void restore_constant_pool_cache() {
+ ldr(rcpool, Address(rfp, frame::interpreter_frame_cache_offset * wordSize));
+ }
+
+ void get_dispatch();
+
+ // Helpers for runtime call arguments/results
+ void get_method(Register reg) {
+ ldr(reg, Address(rfp, frame::interpreter_frame_method_offset * wordSize));
+ }
+
+ void get_const(Register reg) {
+ get_method(reg);
+ ldr(reg, Address(reg, in_bytes(Method::const_offset())));
+ }
+
+ void get_constant_pool(Register reg) {
+ get_const(reg);
+ ldr(reg, Address(reg, in_bytes(ConstMethod::constants_offset())));
+ }
+
+ void get_constant_pool_cache(Register reg) {
+ get_constant_pool(reg);
+ ldr(reg, Address(reg, ConstantPool::cache_offset_in_bytes()));
+ }
+
+ void get_cpool_and_tags(Register cpool, Register tags) {
+ get_constant_pool(cpool);
+ ldr(tags, Address(cpool, ConstantPool::tags_offset_in_bytes()));
+ }
+
+ void get_unsigned_2_byte_index_at_bcp(Register reg, int bcp_offset);
+ void get_cache_and_index_at_bcp(Register cache, Register index, int bcp_offset, size_t index_size = sizeof(u2));
+ void get_cache_and_index_and_bytecode_at_bcp(Register cache, Register index, Register bytecode, int byte_no, int bcp_offset, size_t index_size = sizeof(u2));
+ void get_cache_entry_pointer_at_bcp(Register cache, Register tmp, int bcp_offset, size_t index_size = sizeof(u2));
+ void get_cache_index_at_bcp(Register index, int bcp_offset, size_t index_size = sizeof(u2));
+ void get_method_counters(Register method, Register mcs, Label& skip);
+
+ // load cpool->resolved_references(index);
+ void load_resolved_reference_at_index(Register result, Register index);
+
+ void pop_ptr(Register r = r0);
+ void pop_i(Register r = r0);
+ void pop_l(Register r = r0);
+ void pop_f(FloatRegister r = v0);
+ void pop_d(FloatRegister r = v0);
+ void push_ptr(Register r = r0);
+ void push_i(Register r = r0);
+ void push_l(Register r = r0);
+ void push_f(FloatRegister r = v0);
+ void push_d(FloatRegister r = v0);
+
+ void pop(Register r ) { ((MacroAssembler*)this)->pop(r); }
+
+ void push(Register r ) { ((MacroAssembler*)this)->push(r); }
+
+ void pop(TosState state); // transition vtos -> state
+ void push(TosState state); // transition state -> vtos
+
+ void pop(RegSet regs, Register stack) { ((MacroAssembler*)this)->pop(regs, stack); }
+ void push(RegSet regs, Register stack) { ((MacroAssembler*)this)->push(regs, stack); }
+
+ void empty_expression_stack() {
+ ldr(esp, Address(rfp, frame::interpreter_frame_monitor_block_top_offset * wordSize));
+ // NULL last_sp until next java call
+ str(zr, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
+ }
+
+ // Helpers for swap and dup
+ void load_ptr(int n, Register val);
+ void store_ptr(int n, Register val);
+
+ // Generate a subtype check: branch to ok_is_subtype if sub_klass is
+ // a subtype of super_klass.
+ void gen_subtype_check( Register sub_klass, Label &ok_is_subtype );
+
+ // Dispatching
+ void dispatch_prolog(TosState state, int step = 0);
+ void dispatch_epilog(TosState state, int step = 0);
+ // dispatch via rscratch1
+ void dispatch_only(TosState state);
+ // dispatch normal table via rscratch1 (assume rscratch1 is loaded already)
+ void dispatch_only_normal(TosState state);
+ void dispatch_only_noverify(TosState state);
+ // load rscratch1 from [rbcp + step] and dispatch via rscratch1
+ void dispatch_next(TosState state, int step = 0);
+ // load rscratch1 from [rbcp] and dispatch via rscratch1 and table
+ void dispatch_via (TosState state, address* table);
+
+ // jump to an invoked target
+ void prepare_to_jump_from_interpreted();
+ void jump_from_interpreted(Register method, Register temp);
+
+
+ // Returning from interpreted functions
+ //
+ // Removes the current activation (incl. unlocking of monitors)
+ // and sets up the return address. This code is also used for
+ // exception unwinding. In that case, we do not want to throw
+ // IllegalMonitorStateExceptions, since that might get us into an
+ // infinite rethrow exception loop.
+ // Additionally this code is used for popFrame and earlyReturn.
+ // In popFrame case we want to skip throwing an exception,
+ // installing an exception, and notifying jvmdi.
+ // In earlyReturn case we only want to skip throwing an exception
+ // and installing an exception.
+ void remove_activation(TosState state,
+ bool throw_monitor_exception = true,
+ bool install_monitor_exception = true,
+ bool notify_jvmdi = true);
+#endif // CC_INTERP
+
+ // FIXME: Give us a valid frame at a null check.
+ virtual void null_check(Register reg, int offset = -1) {
+// #ifdef ASSERT
+// save_bcp();
+// set_last_Java_frame(esp, rfp, (address) pc());
+// #endif
+ MacroAssembler::null_check(reg, offset);
+// #ifdef ASSERT
+// reset_last_Java_frame(true, false);
+// #endif
+ }
+
+ // Object locking
+ void lock_object (Register lock_reg);
+ void unlock_object(Register lock_reg);
+
+#ifndef CC_INTERP
+
+ // Interpreter profiling operations
+ void set_method_data_pointer_for_bcp();
+ void test_method_data_pointer(Register mdp, Label& zero_continue);
+ void verify_method_data_pointer();
+
+ void set_mdp_data_at(Register mdp_in, int constant, Register value);
+ void increment_mdp_data_at(Address data, bool decrement = false);
+ void increment_mdp_data_at(Register mdp_in, int constant,
+ bool decrement = false);
+ void increment_mdp_data_at(Register mdp_in, Register reg, int constant,
+ bool decrement = false);
+ void increment_mask_and_jump(Address counter_addr,
+ int increment, int mask,
+ Register scratch, bool preloaded,
+ Condition cond, Label* where);
+ void set_mdp_flag_at(Register mdp_in, int flag_constant);
+ void test_mdp_data_at(Register mdp_in, int offset, Register value,
+ Register test_value_out,
+ Label& not_equal_continue);
+
+ void record_klass_in_profile(Register receiver, Register mdp,
+ Register reg2, bool is_virtual_call);
+ void record_klass_in_profile_helper(Register receiver, Register mdp,
+ Register reg2, int start_row,
+ Label& done, bool is_virtual_call);
+
+ void update_mdp_by_offset(Register mdp_in, int offset_of_offset);
+ void update_mdp_by_offset(Register mdp_in, Register reg, int offset_of_disp);
+ void update_mdp_by_constant(Register mdp_in, int constant);
+ void update_mdp_for_ret(Register return_bci);
+
+ void profile_taken_branch(Register mdp, Register bumped_count);
+ void profile_not_taken_branch(Register mdp);
+ void profile_call(Register mdp);
+ void profile_final_call(Register mdp);
+ void profile_virtual_call(Register receiver, Register mdp,
+ Register scratch2,
+ bool receiver_can_be_null = false);
+ void profile_ret(Register return_bci, Register mdp);
+ void profile_null_seen(Register mdp);
+ void profile_typecheck(Register mdp, Register klass, Register scratch);
+ void profile_typecheck_failed(Register mdp);
+ void profile_switch_default(Register mdp);
+ void profile_switch_case(Register index_in_scratch, Register mdp,
+ Register scratch2);
+
+ void profile_obj_type(Register obj, const Address& mdo_addr);
+ void profile_arguments_type(Register mdp, Register callee, Register tmp, bool is_virtual);
+ void profile_return_type(Register mdp, Register ret, Register tmp);
+ void profile_parameters_type(Register mdp, Register tmp1, Register tmp2);
+
+ // Debugging
+ // only if +VerifyOops && state == atos
+ void verify_oop(Register reg, TosState state = atos);
+ // only if +VerifyFPU && (state == ftos || state == dtos)
+ void verify_FPU(int stack_depth, TosState state = ftos);
+
+#endif // !CC_INTERP
+
+ typedef enum { NotifyJVMTI, SkipNotifyJVMTI } NotifyMethodExitMode;
+
+ // support for jvmti/dtrace
+ void notify_method_entry();
+ void notify_method_exit(TosState state, NotifyMethodExitMode mode);
+
+ virtual void _call_Unimplemented(address call_site) {
+ save_bcp();
+ set_last_Java_frame(esp, rfp, (address) pc(), rscratch1);
+ MacroAssembler::_call_Unimplemented(call_site);
+ }
+};
+
+#endif // CPU_AARCH64_VM_INTERP_MASM_AARCH64_64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/interpreterGenerator_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_INTERPRETERGENERATOR_AARCH64_HPP
+#define CPU_AARCH64_VM_INTERPRETERGENERATOR_AARCH64_HPP
+
+
+// Generation of Interpreter
+//
+ friend class AbstractInterpreterGenerator;
+
+protected:
+
+ void bang_stack_shadow_pages(bool native_call);
+
+private:
+
+ address generate_normal_entry(bool synchronized);
+ address generate_native_entry(bool synchronized);
+ address generate_abstract_entry(void);
+ address generate_math_entry(AbstractInterpreter::MethodKind kind);
+ address generate_jump_to_normal_entry(void);
+ address generate_accessor_entry(void) { return generate_jump_to_normal_entry(); }
+ address generate_empty_entry(void) { return generate_jump_to_normal_entry(); }
+ void generate_transcendental_entry(AbstractInterpreter::MethodKind kind, int fpargs);
+ address generate_Reference_get_entry();
+ address generate_CRC32_update_entry();
+ address generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind);
+ void lock_method(void);
+ void generate_stack_overflow_check(void);
+
+ void generate_counter_incr(Label* overflow, Label* profile_method, Label* profile_method_continue);
+ void generate_counter_overflow(Label* do_continue);
+
+#endif // CPU_AARCH64_VM_INTERPRETERGENERATOR_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/interpreterRT_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "memory/allocation.inline.hpp"
+#include "memory/universe.inline.hpp"
+#include "oops/method.hpp"
+#include "oops/oop.inline.hpp"
+#include "runtime/handles.inline.hpp"
+#include "runtime/icache.hpp"
+#include "runtime/interfaceSupport.hpp"
+#include "runtime/signature.hpp"
+
+#define __ _masm->
+
+// Implementation of SignatureHandlerGenerator
+Register InterpreterRuntime::SignatureHandlerGenerator::from() { return rlocals; }
+Register InterpreterRuntime::SignatureHandlerGenerator::to() { return sp; }
+Register InterpreterRuntime::SignatureHandlerGenerator::temp() { return rscratch1; }
+
+void InterpreterRuntime::SignatureHandlerGenerator::pass_int() {
+ const Address src(from(), Interpreter::local_offset_in_bytes(offset()));
+
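+ // n.b. the first seven integer/pointer arguments go in c_rarg1..c_rarg7
+ // (c_rarg0 is reserved for the JNIEnv); any further arguments are copied
+ // to the outgoing native stack area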
+ switch (_num_int_args) {
+ case 0:
+ __ ldr(c_rarg1, src);
+ _num_int_args++;
+ break;
+ case 1:
+ __ ldr(c_rarg2, src);
+ _num_int_args++;
+ break;
+ case 2:
+ __ ldr(c_rarg3, src);
+ _num_int_args++;
+ break;
+ case 3:
+ __ ldr(c_rarg4, src);
+ _num_int_args++;
+ break;
+ case 4:
+ __ ldr(c_rarg5, src);
+ _num_int_args++;
+ break;
+ case 5:
+ __ ldr(c_rarg6, src);
+ _num_int_args++;
+ break;
+ case 6:
+ __ ldr(c_rarg7, src);
+ _num_int_args++;
+ break;
+ default:
+ __ ldr(r0, src);
+ __ str(r0, Address(to(), _stack_offset));
+ _stack_offset += wordSize;
+ _num_int_args++;
+ break;
+ }
+}
+
+void InterpreterRuntime::SignatureHandlerGenerator::pass_long() {
+ const Address src(from(), Interpreter::local_offset_in_bytes(offset() + 1));
+
+ switch (_num_int_args) {
+ case 0:
+ __ ldr(c_rarg1, src);
+ _num_int_args++;
+ break;
+ case 1:
+ __ ldr(c_rarg2, src);
+ _num_int_args++;
+ break;
+ case 2:
+ __ ldr(c_rarg3, src);
+ _num_int_args++;
+ break;
+ case 3:
+ __ ldr(c_rarg4, src);
+ _num_int_args++;
+ break;
+ case 4:
+ __ ldr(c_rarg5, src);
+ _num_int_args++;
+ break;
+ case 5:
+ __ ldr(c_rarg6, src);
+ _num_int_args++;
+ break;
+ case 6:
+ __ ldr(c_rarg7, src);
+ _num_int_args++;
+ break;
+ default:
+ __ ldr(r0, src);
+ __ str(r0, Address(to(), _stack_offset));
+ _stack_offset += wordSize;
+ _num_int_args++;
+ break;
+ }
+}
+
+void InterpreterRuntime::SignatureHandlerGenerator::pass_float() {
+ const Address src(from(), Interpreter::local_offset_in_bytes(offset()));
+
+ if (_num_fp_args < Argument::n_float_register_parameters_c) {
+ __ ldrs(as_FloatRegister(_num_fp_args++), src);
+ } else {
+ __ ldrw(r0, src);
+ __ strw(r0, Address(to(), _stack_offset));
+ _stack_offset += wordSize;
+ _num_fp_args++;
+ }
+}
+
+void InterpreterRuntime::SignatureHandlerGenerator::pass_double() {
+ const Address src(from(), Interpreter::local_offset_in_bytes(offset() + 1));
+
+ if (_num_fp_args < Argument::n_float_register_parameters_c) {
+ __ ldrd(as_FloatRegister(_num_fp_args++), src);
+ } else {
+ __ ldr(r0, src);
+ __ str(r0, Address(to(), _stack_offset));
+ _stack_offset += wordSize;
+ _num_fp_args++;
+ }
+}
+
+void InterpreterRuntime::SignatureHandlerGenerator::pass_object() {
+
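+ // object arguments are passed by reference: we pass the address of the local
+ // slot, or NULL when the slot holds a null oop (the receiver in slot 0 is
+ // never null)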
+ switch (_num_int_args) {
+ case 0:
+ assert(offset() == 0, "argument register 1 can only be (non-null) receiver");
+ __ add(c_rarg1, from(), Interpreter::local_offset_in_bytes(offset()));
+ _num_int_args++;
+ break;
+ case 1:
+ {
+ __ add(r0, from(), Interpreter::local_offset_in_bytes(offset()));
+ __ mov(c_rarg2, 0);
+ __ ldr(temp(), r0);
+ Label L;
+ __ cbz(temp(), L);
+ __ mov(c_rarg2, r0);
+ __ bind(L);
+ _num_int_args++;
+ break;
+ }
+ case 2:
+ {
+ __ add(r0, from(), Interpreter::local_offset_in_bytes(offset()));
+ __ mov(c_rarg3, 0);
+ __ ldr(temp(), r0);
+ Label L;
+ __ cbz(temp(), L);
+ __ mov(c_rarg3, r0);
+ __ bind(L);
+ _num_int_args++;
+ break;
+ }
+ case 3:
+ {
+ __ add(r0, from(), Interpreter::local_offset_in_bytes(offset()));
+ __ mov(c_rarg4, 0);
+ __ ldr(temp(), r0);
+ Label L;
+ __ cbz(temp(), L);
+ __ mov(c_rarg4, r0);
+ __ bind(L);
+ _num_int_args++;
+ break;
+ }
+ case 4:
+ {
+ __ add(r0, from(), Interpreter::local_offset_in_bytes(offset()));
+ __ mov(c_rarg5, 0);
+ __ ldr(temp(), r0);
+ Label L;
+ __ cbz(temp(), L);
+ __ mov(c_rarg5, r0);
+ __ bind(L);
+ _num_int_args++;
+ break;
+ }
+ case 5:
+ {
+ __ add(r0, from(), Interpreter::local_offset_in_bytes(offset()));
+ __ mov(c_rarg6, 0);
+ __ ldr(temp(), r0);
+ Label L;
+ __ cbz(temp(), L);
+ __ mov(c_rarg6, r0);
+ __ bind(L);
+ _num_int_args++;
+ break;
+ }
+ case 6:
+ {
+ __ add(r0, from(), Interpreter::local_offset_in_bytes(offset()));
+ __ mov(c_rarg7, 0);
+ __ ldr(temp(), r0);
+ Label L;
+ __ cbz(temp(), L);
+ __ mov(c_rarg7, r0);
+ __ bind(L);
+ _num_int_args++;
+ break;
+ }
+ default:
+ {
+ __ add(r0, from(), Interpreter::local_offset_in_bytes(offset()));
+ __ ldr(temp(), r0);
+ Label L;
+ __ cbnz(temp(), L);
+ __ mov(r0, zr);
+ __ bind(L);
+ __ str(r0, Address(to(), _stack_offset));
+ _stack_offset += wordSize;
+ _num_int_args++;
+ break;
+ }
+ }
+}
+
+void InterpreterRuntime::SignatureHandlerGenerator::generate(uint64_t fingerprint) {
+ // generate code to handle arguments
+ iterate(fingerprint);
+
+ // set the call format
+ // n.b. allow extra 1 for the JNI_Env in c_rarg0
+ unsigned int call_format = ((_num_int_args + 1) << 6) | (_num_fp_args << 2);
+
+ switch (method()->result_type()) {
+ case T_VOID:
+ call_format |= MacroAssembler::ret_type_void;
+ break;
+ case T_FLOAT:
+ call_format |= MacroAssembler::ret_type_float;
+ break;
+ case T_DOUBLE:
+ call_format |= MacroAssembler::ret_type_double;
+ break;
+ default:
+ call_format |= MacroAssembler::ret_type_integral;
+ break;
+ }
+
+ // // store the call format in the method
+ // __ movw(r0, call_format);
+ // __ str(r0, Address(rmethod, Method::call_format_offset()));
+
+ // return result handler
+ __ lea(r0, ExternalAddress(Interpreter::result_handler(method()->result_type())));
+ __ ret(lr);
+
+ __ flush();
+}
+
+
+// Implementation of SignatureHandlerLibrary
+
+void SignatureHandlerLibrary::pd_set_handler(address handler) {}
+
+
+class SlowSignatureHandler
+ : public NativeSignatureIterator {
+ private:
+ address _from;
+ intptr_t* _to;
+ intptr_t* _int_args;
+ intptr_t* _fp_args;
+ intptr_t* _fp_identifiers;
+ unsigned int _num_int_args;
+ unsigned int _num_fp_args;
+
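+ // n.b. c_rarg0 is reserved for the JNIEnv, so only
+ // n_int_register_parameters_c - 1 general registers are available for
+ // integer/oop arguments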
+ virtual void pass_int()
+ {
+ jint from_obj = *(jint *)(_from+Interpreter::local_offset_in_bytes(0));
+ _from -= Interpreter::stackElementSize;
+
+ if (_num_int_args < Argument::n_int_register_parameters_c-1) {
+ *_int_args++ = from_obj;
+ _num_int_args++;
+ } else {
+ *_to++ = from_obj;
+ _num_int_args++;
+ }
+ }
+
+ virtual void pass_long()
+ {
+ intptr_t from_obj = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(1));
+ _from -= 2*Interpreter::stackElementSize;
+
+ if (_num_int_args < Argument::n_int_register_parameters_c-1) {
+ *_int_args++ = from_obj;
+ _num_int_args++;
+ } else {
+ *_to++ = from_obj;
+ _num_int_args++;
+ }
+ }
+
+ virtual void pass_object()
+ {
+ intptr_t *from_addr = (intptr_t*)(_from + Interpreter::local_offset_in_bytes(0));
+ _from -= Interpreter::stackElementSize;
+
+ if (_num_int_args < Argument::n_int_register_parameters_c-1) {
+ *_int_args++ = (*from_addr == 0) ? NULL : (intptr_t)from_addr;
+ _num_int_args++;
+ } else {
+ *_to++ = (*from_addr == 0) ? NULL : (intptr_t) from_addr;
+ _num_int_args++;
+ }
+ }
+
+ virtual void pass_float()
+ {
+ jint from_obj = *(jint*)(_from+Interpreter::local_offset_in_bytes(0));
+ _from -= Interpreter::stackElementSize;
+
+ if (_num_fp_args < Argument::n_float_register_parameters_c) {
+ *_fp_args++ = from_obj;
+ _num_fp_args++;
+ } else {
+ *_to++ = from_obj;
+ _num_int_args++;
+ }
+ }
+
+ virtual void pass_double()
+ {
+ intptr_t from_obj = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(1));
+ _from -= 2*Interpreter::stackElementSize;
+
+ if (_num_fp_args < Argument::n_float_register_parameters_c) {
+ *_fp_args++ = from_obj;
+ *_fp_identifiers |= (1 << _num_fp_args); // mark as double
+ _num_fp_args++;
+ } else {
+ *_to++ = from_obj;
+ _num_int_args++;
+ }
+ }
+
+ public:
+ SlowSignatureHandler(methodHandle method, address from, intptr_t* to)
+ : NativeSignatureIterator(method)
+ {
+ _from = from;
+ _to = to;
+
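+ // stage register arguments below 'to', matching the area reserved by the
+ // slow-signature-handler stub: the integer register args, then the
+ // float/double identifier word, then the 8 FP register args; stack args
+ // are written from 'to' upwards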
+ _int_args = to - (method->is_static() ? 16 : 17);
+ _fp_args = to - 8;
+ _fp_identifiers = to - 9;
+ *(int*) _fp_identifiers = 0;
+ _num_int_args = (method->is_static() ? 1 : 0);
+ _num_fp_args = 0;
+ }
+
+ // n.b. allow extra 1 for the JNI_Env in c_rarg0
+ unsigned int get_call_format()
+ {
+ unsigned int call_format = ((_num_int_args + 1) << 6) | (_num_fp_args << 2);
+
+ switch (method()->result_type()) {
+ case T_VOID:
+ call_format |= MacroAssembler::ret_type_void;
+ break;
+ case T_FLOAT:
+ call_format |= MacroAssembler::ret_type_float;
+ break;
+ case T_DOUBLE:
+ call_format |= MacroAssembler::ret_type_double;
+ break;
+ default:
+ call_format |= MacroAssembler::ret_type_integral;
+ break;
+ }
+
+ return call_format;
+ }
+};
+
+
+IRT_ENTRY(address,
+ InterpreterRuntime::slow_signature_handler(JavaThread* thread,
+ Method* method,
+ intptr_t* from,
+ intptr_t* to))
+ methodHandle m(thread, (Method*)method);
+ assert(m->is_native(), "sanity check");
+
+ // handle arguments
+ SlowSignatureHandler ssh(m, (address)from, to);
+ ssh.iterate(UCONST64(-1));
+
+ // // set the call format
+ // method->set_call_format(ssh.get_call_format());
+
+ // return result handler
+ return Interpreter::result_handler(m->result_type());
+IRT_END
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/interpreterRT_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_INTERPRETERRT_AARCH64_HPP
+#define CPU_AARCH64_VM_INTERPRETERRT_AARCH64_HPP
+
+#include "asm/macroAssembler.hpp"
+#include "memory/allocation.hpp"
+
+// native method calls
+
+class SignatureHandlerGenerator: public NativeSignatureIterator {
+ private:
+ MacroAssembler* _masm;
+ unsigned int _call_format;
+ unsigned int _num_fp_args;
+ unsigned int _num_int_args;
+ int _stack_offset;
+
+ void pass_int();
+ void pass_long();
+ void pass_float();
+ void pass_double();
+ void pass_object();
+
+ public:
+ // Creation
+ SignatureHandlerGenerator(methodHandle method, CodeBuffer* buffer) : NativeSignatureIterator(method) {
+ _masm = new MacroAssembler(buffer);
+ _num_int_args = (method->is_static() ? 1 : 0);
+ _num_fp_args = 0;
+ _stack_offset = 0;
+ }
+
+ // Code generation
+ void generate(uint64_t fingerprint);
+
+ // Code generation support
+ static Register from();
+ static Register to();
+ static Register temp();
+};
+
+#endif // CPU_AARCH64_VM_INTERPRETERRT_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/interpreter_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "interpreter/bytecodeHistogram.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterGenerator.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "interpreter/interp_masm.hpp"
+#include "interpreter/templateTable.hpp"
+#include "oops/arrayOop.hpp"
+#include "oops/methodData.hpp"
+#include "oops/method.hpp"
+#include "oops/oop.inline.hpp"
+#include "prims/jvmtiExport.hpp"
+#include "prims/jvmtiThreadState.hpp"
+#include "prims/methodHandles.hpp"
+#include "runtime/arguments.hpp"
+#include "runtime/deoptimization.hpp"
+#include "runtime/frame.inline.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "runtime/synchronizer.hpp"
+#include "runtime/timer.hpp"
+#include "runtime/vframeArray.hpp"
+#include "utilities/debug.hpp"
+#ifdef COMPILER1
+#include "c1/c1_Runtime1.hpp"
+#endif
+
+#define __ _masm->
+
+
+address AbstractInterpreterGenerator::generate_slow_signature_handler() {
+ address entry = __ pc();
+
+ __ andr(esp, esp, -16);
+ __ mov(c_rarg3, esp);
+ // rmethod
+ // rlocals
+ // c_rarg3: first stack arg - wordSize
+
+ // adjust sp
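+ // reserve 18 words below the first stack arg: 1 garbage slot, 8 integer
+ // register args, the float/double identifier word and 8 double args
+ // (see the stack layout below)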
+ __ sub(sp, c_rarg3, 18 * wordSize);
+ __ str(lr, Address(__ pre(sp, -2 * wordSize)));
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::slow_signature_handler),
+ rmethod, rlocals, c_rarg3);
+
+ // r0: result handler
+
+ // Stack layout:
+ // rsp: return address <- sp
+ // 1 garbage
+ // 8 integer args (if static first is unused)
+ // 1 float/double identifiers
+ // 8 double args
+ // stack args <- esp
+ // garbage
+ // expression stack bottom
+ // bcp (NULL)
+ // ...
+
+ // Restore LR
+ __ ldr(lr, Address(__ post(sp, 2 * wordSize)));
+
+ // Do FP first so we can use c_rarg3 as temp
+ __ ldrw(c_rarg3, Address(sp, 9 * wordSize)); // float/double identifiers
+
+ for (int i = 0; i < Argument::n_float_register_parameters_c; i++) {
+ const FloatRegister r = as_FloatRegister(i);
+
+ Label d, done;
+
+ __ tbnz(c_rarg3, i, d);
+ __ ldrs(r, Address(sp, (10 + i) * wordSize));
+ __ b(done);
+ __ bind(d);
+ __ ldrd(r, Address(sp, (10 + i) * wordSize));
+ __ bind(done);
+ }
+
+ // c_rarg0 contains the result from the call of
+ // InterpreterRuntime::slow_signature_handler so we don't touch it
+ // here. It will be loaded with the JNIEnv* later.
+ __ ldr(c_rarg1, Address(sp, 1 * wordSize));
+ for (int i = c_rarg2->encoding(); i <= c_rarg7->encoding(); i += 2) {
+ Register rm = as_Register(i), rn = as_Register(i+1);
+ __ ldp(rm, rn, Address(sp, i * wordSize));
+ }
+
+ __ add(sp, sp, 18 * wordSize);
+ __ ret(lr);
+
+ return entry;
+}
+
+
+//
+// Various method entries
+//
+
+address InterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKind kind) {
+ // rmethod: Method*
+ // r13: sender sp
+ // esp: args
+
+ if (!InlineIntrinsics) return NULL; // Generate a vanilla entry
+
+ // These don't need a safepoint check because they aren't virtually
+ // callable. We won't enter these intrinsics from compiled code.
+ // If in the future we added an intrinsic which was virtually callable
+ // we'd have to worry about how to safepoint so that this code is used.
+
+ // mathematical functions inlined by compiler
+ // (interpreter must provide identical implementation
+ // in order to avoid monotonicity bugs when switching
+ // from interpreter to compiler in the middle of some
+ // computation)
+ //
+ // stack:
+ // [ arg ] <-- esp
+ // [ arg ]
+ // retaddr in lr
+
+ address entry_point = NULL;
+ Register continuation = lr;
+ switch (kind) {
+ case Interpreter::java_lang_math_abs:
+ entry_point = __ pc();
+ __ ldrd(v0, Address(esp));
+ __ fabsd(v0, v0);
+ __ mov(sp, r13); // Restore caller's SP
+ break;
+ case Interpreter::java_lang_math_sqrt:
+ entry_point = __ pc();
+ __ ldrd(v0, Address(esp));
+ __ fsqrtd(v0, v0);
+ __ mov(sp, r13);
+ break;
+ case Interpreter::java_lang_math_sin :
+ case Interpreter::java_lang_math_cos :
+ case Interpreter::java_lang_math_tan :
+ case Interpreter::java_lang_math_log :
+ case Interpreter::java_lang_math_log10 :
+ case Interpreter::java_lang_math_exp :
+ entry_point = __ pc();
+ __ ldrd(v0, Address(esp));
+ __ mov(sp, r13);
+ __ mov(r19, lr);
+ continuation = r19; // The first callee-saved register
+ generate_transcendental_entry(kind, 1);
+ break;
+ case Interpreter::java_lang_math_pow :
+ entry_point = __ pc();
+ __ mov(r19, lr);
+ continuation = r19;
+ __ ldrd(v0, Address(esp, 2 * Interpreter::stackElementSize));
+ __ ldrd(v1, Address(esp));
+ __ mov(sp, r13);
+ generate_transcendental_entry(kind, 2);
+ break;
+ default:
+ ;
+ }
+ if (entry_point) {
+ __ br(continuation);
+ }
+
+ return entry_point;
+}
+
+ // double trigonometrics and transcendentals
+ // static jdouble dsin(jdouble x);
+ // static jdouble dcos(jdouble x);
+ // static jdouble dtan(jdouble x);
+ // static jdouble dlog(jdouble x);
+ // static jdouble dlog10(jdouble x);
+ // static jdouble dexp(jdouble x);
+ // static jdouble dpow(jdouble x, jdouble y);
+
+void InterpreterGenerator::generate_transcendental_entry(AbstractInterpreter::MethodKind kind, int fpargs) {
+ address fn;
+ switch (kind) {
+ case Interpreter::java_lang_math_sin :
+ fn = CAST_FROM_FN_PTR(address, SharedRuntime::dsin);
+ break;
+ case Interpreter::java_lang_math_cos :
+ fn = CAST_FROM_FN_PTR(address, SharedRuntime::dcos);
+ break;
+ case Interpreter::java_lang_math_tan :
+ fn = CAST_FROM_FN_PTR(address, SharedRuntime::dtan);
+ break;
+ case Interpreter::java_lang_math_log :
+ fn = CAST_FROM_FN_PTR(address, SharedRuntime::dlog);
+ break;
+ case Interpreter::java_lang_math_log10 :
+ fn = CAST_FROM_FN_PTR(address, SharedRuntime::dlog10);
+ break;
+ case Interpreter::java_lang_math_exp :
+ fn = CAST_FROM_FN_PTR(address, SharedRuntime::dexp);
+ break;
+ case Interpreter::java_lang_math_pow :
+ fpargs = 2;
+ fn = CAST_FROM_FN_PTR(address, SharedRuntime::dpow);
+ break;
+ default:
+ ShouldNotReachHere();
+ }
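+ // no general-purpose register args; rtype encodes the return type (these
+ // routines all return a double) for the simulator-aware call through blrt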
+ const int gpargs = 0, rtype = 3;
+ __ mov(rscratch1, fn);
+ __ blrt(rscratch1, gpargs, fpargs, rtype);
+}
+
+// Accessor and empty method entries jump to the normal entry path.
+// The "fast" optimization doesn't update the compilation count and can therefore
+// disable inlining of functions that should be inlined.
+address InterpreterGenerator::generate_jump_to_normal_entry(void) {
+ address entry_point = __ pc();
+
+ assert(Interpreter::entry_for_kind(Interpreter::zerolocals) != NULL, "should already be generated");
+ __ b(Interpreter::entry_for_kind(Interpreter::zerolocals));
+ return entry_point;
+}
+
+// Abstract method entry
+// Attempt to execute abstract method. Throw exception
+address InterpreterGenerator::generate_abstract_entry(void) {
+ // rmethod: Method*
+ // r13: sender SP
+
+ address entry_point = __ pc();
+
+ // abstract method entry
+
+ // pop return address, reset last_sp to NULL
+ __ empty_expression_stack();
+ __ restore_bcp(); // bcp must be correct for exception handler (was destroyed)
+ __ restore_locals(); // make sure locals pointer is correct as well (was destroyed)
+
+ // throw exception
+ __ call_VM(noreg, CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::throw_AbstractMethodError));
+ // the call_VM checks for exception, so we should never return here.
+ __ should_not_reach_here();
+
+ return entry_point;
+}
+
+
+void Deoptimization::unwind_callee_save_values(frame* f, vframeArray* vframe_array) {
+
+ // This code is sort of the equivalent of C2IAdapter::setup_stack_frame back in
+ // the days we had adapter frames. When we deoptimize in a situation where a
+ // compiled caller calls a compiled callee, the caller will have registers it
+ // expects to survive the call to the callee. If we deoptimize the callee, the
+ // only way we can restore these registers is to have the oldest interpreter
+ // frame that we create restore these values. That is what this routine
+ // accomplishes.
+
+ // At the moment we have modified c2 to not have any callee save registers
+ // so this problem does not exist and this routine is just a place holder.
+
+ assert(f->is_interpreted_frame(), "must be interpreted");
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/interpreter_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_INTERPRETER_AARCH64_HPP
+#define CPU_AARCH64_VM_INTERPRETER_AARCH64_HPP
+
+ public:
+
+ // Offset from rsp (which points to the last stack element)
+ static int expr_offset_in_bytes(int i) { return stackElementSize * i; }
+
+ // Stack index relative to tos (which points at value)
+ static int expr_index_at(int i) { return stackElementWords * i; }
+
+ // Already negated by c++ interpreter
+ static int local_index_at(int i) {
+ assert(i <= 0, "local direction already negated");
+ return stackElementWords * i;
+ }
+
+#endif // CPU_AARCH64_VM_INTERPRETER_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/javaFrameAnchor_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_JAVAFRAMEANCHOR_AARCH64_HPP
+#define CPU_AARCH64_VM_JAVAFRAMEANCHOR_AARCH64_HPP
+
+private:
+
+ // FP value associated with _last_Java_sp:
+ intptr_t* volatile _last_Java_fp; // pointer is volatile not what it points to
+
+public:
+ // Each arch must define reset, save, restore
+ // These are used by objects that only care about:
+ // 1 - initializing a new state (thread creation, javaCalls)
+ // 2 - saving a current state (javaCalls)
+ // 3 - restoring an old state (javaCalls)
+
+ void clear(void) {
+ // clearing _last_Java_sp must be first
+ _last_Java_sp = NULL;
+ OrderAccess::release();
+ _last_Java_fp = NULL;
+ _last_Java_pc = NULL;
+ }
+
+ void copy(JavaFrameAnchor* src) {
+ // In order to make sure the transition state is valid for "this"
+ // We must clear _last_Java_sp before copying the rest of the new data
+ //
+ // Hack Alert: Temporary bugfix for 4717480/4721647
+ // To act like previous version (pd_cache_state) don't NULL _last_Java_sp
+ // unless the value is changing
+ //
+ if (_last_Java_sp != src->_last_Java_sp) {
+ _last_Java_sp = NULL;
+ OrderAccess::release();
+ }
+ _last_Java_fp = src->_last_Java_fp;
+ _last_Java_pc = src->_last_Java_pc;
+ // Must be last so profiler will always see valid frame if has_last_frame() is true
+ _last_Java_sp = src->_last_Java_sp;
+ }
+
+ // Always walkable
+ bool walkable(void) { return true; }
+ // Never anything to do since we are always walkable and can find the address of return addresses
+ void make_walkable(JavaThread* thread) { }
+
+ intptr_t* last_Java_sp(void) const { return _last_Java_sp; }
+
+ address last_Java_pc(void) { return _last_Java_pc; }
+
+private:
+
+ static ByteSize last_Java_fp_offset() { return byte_offset_of(JavaFrameAnchor, _last_Java_fp); }
+
+public:
+
+ void set_last_Java_sp(intptr_t* sp) { _last_Java_sp = sp; OrderAccess::release(); }
+
+ intptr_t* last_Java_fp(void) { return _last_Java_fp; }
+ // Assert (last_Java_sp == NULL || fp == NULL)
+ void set_last_Java_fp(intptr_t* fp) { OrderAccess::release(); _last_Java_fp = fp; }
+
+#endif // CPU_AARCH64_VM_JAVAFRAMEANCHOR_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/jniFastGetField_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "memory/resourceArea.hpp"
+#include "prims/jniFastGetField.hpp"
+#include "prims/jvm_misc.hpp"
+#include "runtime/safepoint.hpp"
+
+#define __ masm->
+
+#define BUFFER_SIZE 30*wordSize
+
+// Instead of issuing a LoadLoad barrier we create an address
+// dependency between loads; this might be more efficient.
+
+// Common register usage:
+// r0/v0: result
+// c_rarg0: jni env
+// c_rarg1: obj
+// c_rarg2: jfield id
+
+static const Register robj = r3;
+static const Register rcounter = r4;
+static const Register roffset = r5;
+static const Register rcounter_addr = r6;
+static const Register result = r7;
+
+address JNI_FastGetField::generate_fast_get_int_field0(BasicType type) {
+ const char *name;
+ switch (type) {
+ case T_BOOLEAN: name = "jni_fast_GetBooleanField"; break;
+ case T_BYTE: name = "jni_fast_GetByteField"; break;
+ case T_CHAR: name = "jni_fast_GetCharField"; break;
+ case T_SHORT: name = "jni_fast_GetShortField"; break;
+ case T_INT: name = "jni_fast_GetIntField"; break;
+ case T_LONG: name = "jni_fast_GetLongField"; break;
+ case T_FLOAT: name = "jni_fast_GetFloatField"; break;
+ case T_DOUBLE: name = "jni_fast_GetDoubleField"; break;
+ default: ShouldNotReachHere();
+ }
+ ResourceMark rm;
+ BufferBlob* blob = BufferBlob::create(name, BUFFER_SIZE);
+ CodeBuffer cbuf(blob);
+ MacroAssembler* masm = new MacroAssembler(&cbuf);
+ address fast_entry = __ pc();
+
+ Label slow;
+
+ unsigned long offset;
+ __ adrp(rcounter_addr,
+ SafepointSynchronize::safepoint_counter_addr(), offset);
+ Address safepoint_counter_addr(rcounter_addr, offset);
+ __ ldrw(rcounter, safepoint_counter_addr);
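+ // an odd counter value means a safepoint is in progress, so take the slow path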
+ __ andw(rscratch1, rcounter, 1);
+ __ cbnzw(rscratch1, slow);
+ __ eor(robj, c_rarg1, rcounter);
+ __ eor(robj, robj, rcounter); // obj, since
+ // robj ^ rcounter ^ rcounter == robj
+ // robj is address dependent on rcounter.
+ __ ldr(robj, Address(robj, 0)); // *obj
+ __ lsr(roffset, c_rarg2, 2); // offset
+
+ assert(count < LIST_CAPACITY, "LIST_CAPACITY too small");
+ speculative_load_pclist[count] = __ pc(); // Used by the segfault handler
+ switch (type) {
+ case T_BOOLEAN: __ ldrb (result, Address(robj, roffset)); break;
+ case T_BYTE: __ ldrsb (result, Address(robj, roffset)); break;
+ case T_CHAR: __ ldrh (result, Address(robj, roffset)); break;
+ case T_SHORT: __ ldrsh (result, Address(robj, roffset)); break;
+ case T_FLOAT: __ ldrw (result, Address(robj, roffset)); break;
+ case T_INT: __ ldrsw (result, Address(robj, roffset)); break;
+ case T_DOUBLE:
+ case T_LONG: __ ldr (result, Address(robj, roffset)); break;
+ default: ShouldNotReachHere();
+ }
+
+ // counter_addr is address dependent on result.
+ __ eor(rcounter_addr, rcounter_addr, result);
+ __ eor(rcounter_addr, rcounter_addr, result);
+ __ ldrw(rscratch1, safepoint_counter_addr);
+ __ cmpw(rcounter, rscratch1);
+ __ br (Assembler::NE, slow);
+
+ switch (type) {
+ case T_FLOAT: __ fmovs(v0, result); break;
+ case T_DOUBLE: __ fmovd(v0, result); break;
+ default: __ mov(r0, result); break;
+ }
+ __ ret(lr);
+
+ slowcase_entry_pclist[count++] = __ pc();
+ __ bind(slow);
+ address slow_case_addr;
+ switch (type) {
+ case T_BOOLEAN: slow_case_addr = jni_GetBooleanField_addr(); break;
+ case T_BYTE: slow_case_addr = jni_GetByteField_addr(); break;
+ case T_CHAR: slow_case_addr = jni_GetCharField_addr(); break;
+ case T_SHORT: slow_case_addr = jni_GetShortField_addr(); break;
+ case T_INT: slow_case_addr = jni_GetIntField_addr(); break;
+ case T_LONG: slow_case_addr = jni_GetLongField_addr(); break;
+ case T_FLOAT: slow_case_addr = jni_GetFloatField_addr(); break;
+ case T_DOUBLE: slow_case_addr = jni_GetDoubleField_addr(); break;
+ default: ShouldNotReachHere();
+ }
+
+ {
+ __ enter();
+ __ lea(rscratch1, ExternalAddress(slow_case_addr));
+ __ blr(rscratch1);
+ __ maybe_isb();
+ __ leave();
+ __ ret(lr);
+ }
+ __ flush ();
+
+ return fast_entry;
+}
+
+address JNI_FastGetField::generate_fast_get_boolean_field() {
+ return generate_fast_get_int_field0(T_BOOLEAN);
+}
+
+address JNI_FastGetField::generate_fast_get_byte_field() {
+ return generate_fast_get_int_field0(T_BYTE);
+}
+
+address JNI_FastGetField::generate_fast_get_char_field() {
+ return generate_fast_get_int_field0(T_CHAR);
+}
+
+address JNI_FastGetField::generate_fast_get_short_field() {
+ return generate_fast_get_int_field0(T_SHORT);
+}
+
+address JNI_FastGetField::generate_fast_get_int_field() {
+ return generate_fast_get_int_field0(T_INT);
+}
+
+address JNI_FastGetField::generate_fast_get_long_field() {
+ return generate_fast_get_int_field0(T_LONG);
+}
+
+address JNI_FastGetField::generate_fast_get_float_field() {
+ return generate_fast_get_int_field0(T_FLOAT);
+}
+
+address JNI_FastGetField::generate_fast_get_double_field() {
+ return generate_fast_get_int_field0(T_DOUBLE);
+}
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/jniTypes_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_JNITYPES_AARCH64_HPP
+#define CPU_AARCH64_VM_JNITYPES_AARCH64_HPP
+
+#include "memory/allocation.hpp"
+#include "oops/oop.hpp"
+#include "prims/jni.h"
+
+// This file holds platform-dependent routines used to write primitive jni
+// types to the array of arguments passed into JavaCalls::call
+
+class JNITypes : AllStatic {
+ // These functions write a java primitive type (in native format)
+ // to a java stack slot array to be passed as an argument to JavaCalls::call.
+ // I.e., they are functionally 'push' operations if they have a 'pos'
+ // formal parameter. Note that jlong's and jdouble's are written
+ // _in reverse_ of the order in which they appear in the interpreter
+ // stack. This is because call stubs (see stubGenerator_sparc.cpp)
+ // reverse the argument list constructed by JavaCallArguments (see
+ // javaCalls.hpp).
+
+public:
+ // Ints are stored in native format in one JavaCallArgument slot at *to.
+ static inline void put_int(jint from, intptr_t *to) { *(jint *)(to + 0 ) = from; }
+ static inline void put_int(jint from, intptr_t *to, int& pos) { *(jint *)(to + pos++) = from; }
+ static inline void put_int(jint *from, intptr_t *to, int& pos) { *(jint *)(to + pos++) = *from; }
+
+ // Longs are stored in native format in one JavaCallArgument slot at
+ // *(to+1).
+ static inline void put_long(jlong from, intptr_t *to) {
+ *(jlong*) (to + 1) = from;
+ }
+
+ static inline void put_long(jlong from, intptr_t *to, int& pos) {
+ *(jlong*) (to + 1 + pos) = from;
+ pos += 2;
+ }
+
+ static inline void put_long(jlong *from, intptr_t *to, int& pos) {
+ *(jlong*) (to + 1 + pos) = *from;
+ pos += 2;
+ }
+
+ // Oops are stored in native format in one JavaCallArgument slot at *to.
+ static inline void put_obj(oop from, intptr_t *to) { *(oop *)(to + 0 ) = from; }
+ static inline void put_obj(oop from, intptr_t *to, int& pos) { *(oop *)(to + pos++) = from; }
+ static inline void put_obj(oop *from, intptr_t *to, int& pos) { *(oop *)(to + pos++) = *from; }
+
+ // Floats are stored in native format in one JavaCallArgument slot at *to.
+ static inline void put_float(jfloat from, intptr_t *to) { *(jfloat *)(to + 0 ) = from; }
+ static inline void put_float(jfloat from, intptr_t *to, int& pos) { *(jfloat *)(to + pos++) = from; }
+ static inline void put_float(jfloat *from, intptr_t *to, int& pos) { *(jfloat *)(to + pos++) = *from; }
+
+#undef _JNI_SLOT_OFFSET
+#define _JNI_SLOT_OFFSET 1
+ // Doubles are stored in native word format in one JavaCallArgument
+ // slot at *(to+1).
+ static inline void put_double(jdouble from, intptr_t *to) {
+ *(jdouble*) (to + 1) = from;
+ }
+
+ static inline void put_double(jdouble from, intptr_t *to, int& pos) {
+ *(jdouble*) (to + 1 + pos) = from;
+ pos += 2;
+ }
+
+ static inline void put_double(jdouble *from, intptr_t *to, int& pos) {
+ *(jdouble*) (to + 1 + pos) = *from;
+ pos += 2;
+ }
+
+ // The get_xxx routines, on the other hand, actually _do_ fetch
+ // java primitive types from the interpreter stack.
+ // No need to worry about alignment here.
+ static inline jint get_int (intptr_t *from) { return *(jint *) from; }
+ static inline jlong get_long (intptr_t *from) { return *(jlong *) (from + _JNI_SLOT_OFFSET); }
+ static inline oop get_obj (intptr_t *from) { return *(oop *) from; }
+ static inline jfloat get_float (intptr_t *from) { return *(jfloat *) from; }
+ static inline jdouble get_double(intptr_t *from) { return *(jdouble *)(from + _JNI_SLOT_OFFSET); }
+#undef _JNI_SLOT_OFFSET
+};
+
+#endif // CPU_AARCH64_VM_JNITYPES_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/jni_aarch64.h Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation. Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+#ifndef _JAVASOFT_JNI_MD_H_
+#define _JAVASOFT_JNI_MD_H_
+
+#if defined(SOLARIS) || defined(LINUX) || defined(_ALLBSD_SOURCE)
+
+
+// Note: please do not change these without also changing jni_md.h in the JDK
+// repository
+#ifndef __has_attribute
+ #define __has_attribute(x) 0
+#endif
+#if (defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4) && (__GNUC_MINOR__ > 2))) || __has_attribute(visibility)
+ #define JNIEXPORT __attribute__((visibility("default")))
+ #define JNIIMPORT __attribute__((visibility("default")))
+#else
+ #define JNIEXPORT
+ #define JNIIMPORT
+#endif
+
+ #define JNICALL
+ typedef int jint;
+ typedef long jlong;
+
+#else
+ #define JNIEXPORT __declspec(dllexport)
+ #define JNIIMPORT __declspec(dllimport)
+ #define JNICALL __stdcall
+
+ typedef int jint;
+ typedef __int64 jlong;
+#endif
+
+typedef signed char jbyte;
+
+#endif /* !_JAVASOFT_JNI_MD_H_ */
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,4138 @@
+/*
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include <sys/types.h>
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "interpreter/interpreter.hpp"
+
+#include "compiler/disassembler.hpp"
+#include "memory/resourceArea.hpp"
+#include "runtime/biasedLocking.hpp"
+#include "runtime/icache.hpp"
+#include "runtime/interfaceSupport.hpp"
+#include "runtime/sharedRuntime.hpp"
+
+// #include "gc_interface/collectedHeap.inline.hpp"
+// #include "interpreter/interpreter.hpp"
+// #include "memory/cardTableModRefBS.hpp"
+// #include "prims/methodHandles.hpp"
+// #include "runtime/biasedLocking.hpp"
+// #include "runtime/interfaceSupport.hpp"
+// #include "runtime/objectMonitor.hpp"
+// #include "runtime/os.hpp"
+// #include "runtime/sharedRuntime.hpp"
+// #include "runtime/stubRoutines.hpp"
+
+#if INCLUDE_ALL_GCS
+#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
+#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
+#include "gc_implementation/g1/heapRegion.hpp"
+#endif
+
+#ifdef PRODUCT
+#define BLOCK_COMMENT(str) /* nothing */
+#define STOP(error) stop(error)
+#else
+#define BLOCK_COMMENT(str) block_comment(str)
+#define STOP(error) block_comment(error); stop(error)
+#endif
+
+#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
+
+// Patch any kind of instruction; there may be several instructions.
+// Return the total length (in bytes) of the instructions.
+int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
+ int instructions = 1;
+ assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
+ long offset = (target - branch) >> 2;
+ unsigned insn = *(unsigned*)branch;
+ if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
+ // Load register (literal)
+ Instruction_aarch64::spatch(branch, 23, 5, offset);
+ } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
+ // Unconditional branch (immediate)
+ Instruction_aarch64::spatch(branch, 25, 0, offset);
+ } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
+ // Conditional branch (immediate)
+ Instruction_aarch64::spatch(branch, 23, 5, offset);
+ } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
+ // Compare & branch (immediate)
+ Instruction_aarch64::spatch(branch, 23, 5, offset);
+ } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
+ // Test & branch (immediate)
+ Instruction_aarch64::spatch(branch, 18, 5, offset);
+ } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
+ // PC-rel. addressing
+ offset = target-branch;
+ int shift = Instruction_aarch64::extract(insn, 31, 31);
+ if (shift) {
+ u_int64_t dest = (u_int64_t)target;
+ uint64_t pc_page = (uint64_t)branch >> 12;
+ uint64_t adr_page = (uint64_t)target >> 12;
+ unsigned offset_lo = dest & 0xfff;
+ offset = adr_page - pc_page;
+
+ // We handle 3 types of PC relative addressing
+ // 1 - adrp Rx, target_page
+ // ldr/str Ry, [Rx, #offset_in_page]
+ // 2 - adrp Rx, target_page
+ // add Ry, Rx, #offset_in_page
+ // 3 - adrp Rx, target_page (page aligned reloc, offset == 0)
+ // In the first 2 cases we must check that Rx is the same in the adrp and the
+ // subsequent ldr/str or add instruction. Otherwise we could accidentally end
+ // up treating a type 3 relocation as a type 1 or 2 just because it happened
+ // to be followed by a random unrelated ldr/str or add instruction.
+ //
+ // In the case of a type 3 relocation, we know that these are only generated
+ // for the safepoint polling page, or for the card table byte map base, so we
+ // assert as much and of course that the offset is 0.
+ //
+ unsigned insn2 = ((unsigned*)branch)[1];
+ if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
+ Instruction_aarch64::extract(insn, 4, 0) ==
+ Instruction_aarch64::extract(insn2, 9, 5)) {
+ // Load/store register (unsigned immediate)
+ unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
+ Instruction_aarch64::patch(branch + sizeof (unsigned),
+ 21, 10, offset_lo >> size);
+ guarantee(((dest >> size) << size) == dest, "misaligned target");
+ instructions = 2;
+ } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
+ Instruction_aarch64::extract(insn, 4, 0) ==
+ Instruction_aarch64::extract(insn2, 4, 0)) {
+ // add (immediate)
+ Instruction_aarch64::patch(branch + sizeof (unsigned),
+ 21, 10, offset_lo);
+ instructions = 2;
+ } else {
+ assert((jbyte *)target ==
+ ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base ||
+ target == StubRoutines::crc_table_addr() ||
+ (address)target == os::get_polling_page(),
+ "adrp must be polling page or byte map base");
+ assert(offset_lo == 0, "offset must be 0 for polling page or byte map base");
+ }
+ }
+ int offset_lo = offset & 3;
+ offset >>= 2;
+ Instruction_aarch64::spatch(branch, 23, 5, offset);
+ Instruction_aarch64::patch(branch, 30, 29, offset_lo);
+ } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
+ u_int64_t dest = (u_int64_t)target;
+ // Move wide constant
+ assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
+ assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
+ Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
+ Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
+ Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
+ assert(target_addr_for_insn(branch) == target, "should be");
+ instructions = 3;
+ } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
+ Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
+ // nothing to do
+ assert(target == 0, "did not expect to relocate target for polling page load");
+ } else {
+ ShouldNotReachHere();
+ }
+ return instructions * NativeInstruction::instruction_size;
+}
+
+int MacroAssembler::patch_oop(address insn_addr, address o) {
+ int instructions;
+ unsigned insn = *(unsigned*)insn_addr;
+ assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
+
+ // OOPs are either narrow (32 bits) or wide (48 bits). We encode
+ // narrow OOPs by setting the upper 16 bits in the first
+ // instruction.
+ if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
+ // Move narrow OOP
+ narrowOop n = oopDesc::encode_heap_oop((oop)o);
+ Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
+ Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
+ instructions = 2;
+ } else {
+ // Move wide OOP
+ assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
+ uintptr_t dest = (uintptr_t)o;
+ Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
+ Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
+ Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
+ instructions = 3;
+ }
+ return instructions * NativeInstruction::instruction_size;
+}
+
+address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
+ long offset = 0;
+ if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
+ // Load register (literal)
+ offset = Instruction_aarch64::sextract(insn, 23, 5);
+ return address(((uint64_t)insn_addr + (offset << 2)));
+ } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
+ // Unconditional branch (immediate)
+ offset = Instruction_aarch64::sextract(insn, 25, 0);
+ } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
+ // Conditional branch (immediate)
+ offset = Instruction_aarch64::sextract(insn, 23, 5);
+ } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
+ // Compare & branch (immediate)
+ offset = Instruction_aarch64::sextract(insn, 23, 5);
+ } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
+ // Test & branch (immediate)
+ offset = Instruction_aarch64::sextract(insn, 18, 5);
+ } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
+ // PC-rel. addressing
+ offset = Instruction_aarch64::extract(insn, 30, 29);
+ offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
+ int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
+ if (shift) {
+ offset <<= shift;
+ uint64_t target_page = ((uint64_t)insn_addr) + offset;
+ target_page &= ((uint64_t)-1) << shift;
+ // Return the target address for the following sequences
+ // 1 - adrp Rx, target_page
+ // ldr/str Ry, [Rx, #offset_in_page]
+ // 2 - adrp Rx, target_page
+ // add Ry, Rx, #offset_in_page
+ // 3 - adrp Rx, target_page (page aligned reloc, offset == 0)
+ //
+ // In the first two cases we check that the register is the same and
+ // return the target_page + the offset within the page.
+ // Otherwise we assume it is a page aligned relocation and return
+ // the target page only. The only cases this is generated is for
+ // the safepoint polling page or for the card table byte map base so
+ // we assert as much.
+ //
+ unsigned insn2 = ((unsigned*)insn_addr)[1];
+ if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
+ Instruction_aarch64::extract(insn, 4, 0) ==
+ Instruction_aarch64::extract(insn2, 9, 5)) {
+ // Load/store register (unsigned immediate)
+ unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
+ unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
+ return address(target_page + (byte_offset << size));
+ } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
+ Instruction_aarch64::extract(insn, 4, 0) ==
+ Instruction_aarch64::extract(insn2, 4, 0)) {
+ // add (immediate)
+ unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
+ return address(target_page + byte_offset);
+ } else {
+ assert((jbyte *)target_page ==
+ ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base ||
+ (address)target_page == os::get_polling_page(),
+ "adrp must be polling page or byte map base");
+ return (address)target_page;
+ }
+ } else {
+ ShouldNotReachHere();
+ }
+ } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
+ u_int32_t *insns = (u_int32_t *)insn_addr;
+ // Move wide constant: movz, movk, movk. See movptr().
+ assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
+ assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
+ return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
+ + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
+ + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
+ } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
+ Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
+ return 0;
+ } else {
+ ShouldNotReachHere();
+ }
+ return address(((uint64_t)insn_addr + (offset << 2)));
+}
+
+void MacroAssembler::serialize_memory(Register thread, Register tmp) {
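+ // a full data synchronization barrier is sufficient to serialize this
+ // thread's memory accesses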
+ dsb(Assembler::SY);
+}
+
+
+void MacroAssembler::reset_last_Java_frame(bool clear_fp,
+ bool clear_pc) {
+ // we must set sp to zero to clear frame
+ str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
+ // must clear fp, so that compiled frames are not confused; it is
+ // possible that we need it only for debugging
+ if (clear_fp) {
+ str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
+ }
+
+ if (clear_pc) {
+ str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
+ }
+}
+
+// Calls to C land
+//
+// When entering C land, the rfp and sp of the last Java frame have to be recorded
+// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
+// has to be reset to 0. This is required to allow proper stack traversal.
+void MacroAssembler::set_last_Java_frame(Register last_java_sp,
+ Register last_java_fp,
+ Register last_java_pc,
+ Register scratch) {
+
+ if (last_java_pc->is_valid()) {
+ str(last_java_pc, Address(rthread,
+ JavaThread::frame_anchor_offset()
+ + JavaFrameAnchor::last_Java_pc_offset()));
+ }
+
+ // determine last_java_sp register
+ if (last_java_sp == sp) {
+ mov(scratch, sp);
+ last_java_sp = scratch;
+ } else if (!last_java_sp->is_valid()) {
+ last_java_sp = esp;
+ }
+
+ str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
+
+ // last_java_fp is optional
+ if (last_java_fp->is_valid()) {
+ str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
+ }
+}
+
+void MacroAssembler::set_last_Java_frame(Register last_java_sp,
+ Register last_java_fp,
+ address last_java_pc,
+ Register scratch) {
+ if (last_java_pc != NULL) {
+ adr(scratch, last_java_pc);
+ } else {
+ // FIXME: This is almost never correct. We should delete all
+ // cases of set_last_Java_frame with last_java_pc=NULL and use the
+ // correct return address instead.
+ adr(scratch, pc());
+ }
+
+ str(scratch, Address(rthread,
+ JavaThread::frame_anchor_offset()
+ + JavaFrameAnchor::last_Java_pc_offset()));
+
+ set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
+}
+
+void MacroAssembler::set_last_Java_frame(Register last_java_sp,
+ Register last_java_fp,
+ Label &L,
+ Register scratch) {
+ if (L.is_bound()) {
+ set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
+ } else {
+ InstructionMark im(this);
+ L.add_patch_at(code(), locator());
+ set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
+ }
+}
+
+void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
+ assert(ReservedCodeCacheSize < 4*G, "branch out of range");
+ assert(CodeCache::find_blob(entry.target()) != NULL,
+ "destination of far call not found in code cache");
+ if (far_branches()) {
+ unsigned long offset;
+ // We can use ADRP here because we know that the total size of
+ // the code cache cannot exceed 2Gb.
+ adrp(tmp, entry, offset);
+ add(tmp, tmp, offset);
+ if (cbuf) cbuf->set_insts_mark();
+ blr(tmp);
+ } else {
+ if (cbuf) cbuf->set_insts_mark();
+ bl(entry);
+ }
+}
+
+void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
+ assert(ReservedCodeCacheSize < 4*G, "branch out of range");
+ assert(CodeCache::find_blob(entry.target()) != NULL,
+ "destination of far call not found in code cache");
+ if (far_branches()) {
+ unsigned long offset;
+ // We can use ADRP here because we know that the total size of
+ // the code cache cannot exceed 2Gb.
+ adrp(tmp, entry, offset);
+ add(tmp, tmp, offset);
+ if (cbuf) cbuf->set_insts_mark();
+ br(tmp);
+ } else {
+ if (cbuf) cbuf->set_insts_mark();
+ b(entry);
+ }
+}
+
+int MacroAssembler::biased_locking_enter(Register lock_reg,
+ Register obj_reg,
+ Register swap_reg,
+ Register tmp_reg,
+ bool swap_reg_contains_mark,
+ Label& done,
+ Label* slow_case,
+ BiasedLockingCounters* counters) {
+ assert(UseBiasedLocking, "why call this otherwise?");
+ assert_different_registers(lock_reg, obj_reg, swap_reg);
+
+ if (PrintBiasedLockingStatistics && counters == NULL)
+ counters = BiasedLocking::counters();
+
+ if (tmp_reg == noreg) {
+ tmp_reg = rscratch2;
+ }
+ assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1);
+ assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
+ Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes());
+ Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes());
+ Address saved_mark_addr(lock_reg, 0);
+
+ // Biased locking
+ // See whether the lock is currently biased toward our thread and
+ // whether the epoch is still valid
+ // Note that the runtime guarantees sufficient alignment of JavaThread
+ // pointers to allow age to be placed into low bits
+ // First check to see whether biasing is even enabled for this object
+ Label cas_label;
+ int null_check_offset = -1;
+ if (!swap_reg_contains_mark) {
+ null_check_offset = offset();
+ ldr(swap_reg, mark_addr);
+ }
+ andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
+ cmp(tmp_reg, markOopDesc::biased_lock_pattern);
+ br(Assembler::NE, cas_label);
+ // The bias pattern is present in the object's header. Need to check
+ // whether the bias owner and the epoch are both still current.
+ load_prototype_header(tmp_reg, obj_reg);
+ orr(tmp_reg, tmp_reg, rthread);
+ eor(tmp_reg, swap_reg, tmp_reg);
+ andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
+ if (counters != NULL) {
+ Label around;
+ cbnz(tmp_reg, around);
+ atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1);
+ b(done);
+ bind(around);
+ } else {
+ cbz(tmp_reg, done);
+ }
+
+ Label try_revoke_bias;
+ Label try_rebias;
+
+ // At this point we know that the header has the bias pattern and
+ // that we are not the bias owner in the current epoch. We need to
+ // figure out more details about the state of the header in order to
+ // know what operations can be legally performed on the object's
+ // header.
+
+ // If the low three bits in the xor result aren't clear, that means
+ // the prototype header is no longer biased and we have to revoke
+ // the bias on this object.
+ andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
+ cbnz(rscratch1, try_revoke_bias);
+
+ // Biasing is still enabled for this data type. See whether the
+ // epoch of the current bias is still valid, meaning that the epoch
+ // bits of the mark word are equal to the epoch bits of the
+ // prototype header. (Note that the prototype header's epoch bits
+ // only change at a safepoint.) If not, attempt to rebias the object
+ // toward the current thread. Note that we must be absolutely sure
+ // that the current epoch is invalid in order to do this because
+ // otherwise the manipulations it performs on the mark word are
+ // illegal.
+ andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
+ cbnz(rscratch1, try_rebias);
+
+ // The epoch of the current bias is still valid but we know nothing
+ // about the owner; it might be set or it might be clear. Try to
+ // acquire the bias of the object using an atomic operation. If this
+ // fails we will go in to the runtime to revoke the object's bias.
+ // Note that we first construct the presumed unbiased header so we
+ // don't accidentally blow away another thread's valid bias.
+ {
+ Label here;
+ mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
+ andr(swap_reg, swap_reg, rscratch1);
+ orr(tmp_reg, swap_reg, rthread);
+ cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
+ // If the biasing toward our thread failed, this means that
+ // another thread succeeded in biasing it toward itself and we
+ // need to revoke that bias. The revocation will occur in the
+ // interpreter runtime in the slow case.
+ bind(here);
+ if (counters != NULL) {
+ atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
+ tmp_reg, rscratch1);
+ }
+ }
+ b(done);
+
+ bind(try_rebias);
+ // At this point we know the epoch has expired, meaning that the
+ // current "bias owner", if any, is actually invalid. Under these
+ // circumstances _only_, we are allowed to use the current header's
+ // value as the comparison value when doing the cas to acquire the
+ // bias in the current epoch. In other words, we allow transfer of
+ // the bias from one thread to another directly in this situation.
+ //
+ // FIXME: due to a lack of registers we currently blow away the age
+ // bits in this situation. Should attempt to preserve them.
+ {
+ Label here;
+ load_prototype_header(tmp_reg, obj_reg);
+ orr(tmp_reg, rthread, tmp_reg);
+ cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
+ // If the biasing toward our thread failed, then another thread
+ // succeeded in biasing it toward itself and we need to revoke that
+ // bias. The revocation will occur in the runtime in the slow case.
+ bind(here);
+ if (counters != NULL) {
+ atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
+ tmp_reg, rscratch1);
+ }
+ }
+ b(done);
+
+ bind(try_revoke_bias);
+ // The prototype mark in the klass doesn't have the bias bit set any
+ // more, indicating that objects of this data type are not supposed
+ // to be biased any more. We are going to try to reset the mark of
+ // this object to the prototype value and fall through to the
+ // CAS-based locking scheme. Note that if our CAS fails, it means
+ // that another thread raced us for the privilege of revoking the
+ // bias of this particular object, so it's okay to continue in the
+ // normal locking code.
+ //
+ // FIXME: due to a lack of registers we currently blow away the age
+ // bits in this situation. Should attempt to preserve them.
+ {
+ Label here, nope;
+ load_prototype_header(tmp_reg, obj_reg);
+ cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
+ bind(here);
+
+ // Fall through to the normal CAS-based lock, because no matter what
+ // the result of the above CAS, some thread must have succeeded in
+ // removing the bias bit from the object's header.
+ if (counters != NULL) {
+ atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
+ rscratch1);
+ }
+ bind(nope);
+ }
+
+ bind(cas_label);
+
+ return null_check_offset;
+}
+
+void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
+ assert(UseBiasedLocking, "why call this otherwise?");
+
+ // Check for biased locking unlock case, which is a no-op
+ // Note: we do not have to check the thread ID for two reasons.
+ // First, the interpreter checks for IllegalMonitorStateException at
+ // a higher level. Second, if the bias was revoked while we held the
+ // lock, the object could not be rebiased toward another thread, so
+ // the bias bit would be clear.
+ ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
+ andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
+ cmp(temp_reg, markOopDesc::biased_lock_pattern);
+ br(Assembler::EQ, done);
+}
+
+
+// added to make this compile
+
+REGISTER_DEFINITION(Register, noreg);
+
+static void pass_arg0(MacroAssembler* masm, Register arg) {
+ if (c_rarg0 != arg ) {
+ masm->mov(c_rarg0, arg);
+ }
+}
+
+static void pass_arg1(MacroAssembler* masm, Register arg) {
+ if (c_rarg1 != arg ) {
+ masm->mov(c_rarg1, arg);
+ }
+}
+
+static void pass_arg2(MacroAssembler* masm, Register arg) {
+ if (c_rarg2 != arg ) {
+ masm->mov(c_rarg2, arg);
+ }
+}
+
+static void pass_arg3(MacroAssembler* masm, Register arg) {
+ if (c_rarg3 != arg ) {
+ masm->mov(c_rarg3, arg);
+ }
+}
+
+void MacroAssembler::call_VM_base(Register oop_result,
+ Register java_thread,
+ Register last_java_sp,
+ address entry_point,
+ int number_of_arguments,
+ bool check_exceptions) {
+ // determine java_thread register
+ if (!java_thread->is_valid()) {
+ java_thread = rthread;
+ }
+
+ // determine last_java_sp register
+ if (!last_java_sp->is_valid()) {
+ last_java_sp = esp;
+ }
+
+ // debugging support
+ assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
+ assert(java_thread == rthread, "unexpected register");
+#ifdef ASSERT
+ // TraceBytecodes does not use r12 but saves it over the call, so don't verify
+ // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
+#endif // ASSERT
+
+ assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
+ assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
+
+ // push java thread (becomes first argument of C function)
+
+ mov(c_rarg0, java_thread);
+
+ // set last Java frame before call
+ assert(last_java_sp != rfp, "can't use rfp");
+
+ Label l;
+ set_last_Java_frame(last_java_sp, rfp, l, rscratch1);
+
+ // do the call, remove parameters
+ MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
+
+ // reset last Java frame
+ // Only interpreter should have to clear fp
+ reset_last_Java_frame(true, true);
+
+ // C++ interp handles this in the interpreter
+ check_and_handle_popframe(java_thread);
+ check_and_handle_earlyret(java_thread);
+
+ if (check_exceptions) {
+ // check for pending exceptions (java_thread is set upon return)
+ ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
+ Label ok;
+ cbz(rscratch1, ok);
+ lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
+ br(rscratch1);
+ bind(ok);
+ }
+
+ // get oop result if there is one and reset the value in the thread
+ if (oop_result->is_valid()) {
+ get_vm_result(oop_result, java_thread);
+ }
+}
+
+void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
+ call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
+}
+
+// Maybe emit a call via a trampoline. If the code cache is small,
+// trampolines won't be emitted.
+
+void MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
+ assert(entry.rspec().type() == relocInfo::runtime_call_type
+ || entry.rspec().type() == relocInfo::opt_virtual_call_type
+ || entry.rspec().type() == relocInfo::static_call_type
+ || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
+
+ unsigned int start_offset = offset();
+ if (far_branches() && !Compile::current()->in_scratch_emit_size()) {
+ emit_trampoline_stub(offset(), entry.target());
+ }
+
+ if (cbuf) cbuf->set_insts_mark();
+ relocate(entry.rspec());
+ if (Assembler::reachable_from_branch_at(pc(), entry.target())) {
+ bl(entry.target());
+ } else {
+ bl(pc());
+ }
+}
+
+
+// Emit a trampoline stub for a call to a target which is too far away.
+//
+// code sequences:
+//
+// call-site:
+// branch-and-link to <destination> or <trampoline stub>
+//
+// Related trampoline stub for this call site in the stub section:
+// load the call target from the constant pool
+// branch (LR still points to the call site above)
+
+void MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
+ address dest) {
+ address stub = start_a_stub(Compile::MAX_stubs_size/2);
+ if (stub == NULL) {
+ start_a_stub(Compile::MAX_stubs_size/2);
+ Compile::current()->env()->record_out_of_memory_failure();
+ return;
+ }
+
+ // Create a trampoline stub relocation which relates this trampoline stub
+ // with the call instruction at insts_call_instruction_offset in the
+ // instructions code-section.
+ align(wordSize);
+ relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
+ + insts_call_instruction_offset));
+ const int stub_start_offset = offset();
+
+ // Now, create the trampoline stub's code:
+ // - load the call
+ // - call
+ Label target;
+ ldr(rscratch1, target);
+ br(rscratch1);
+ bind(target);
+ assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
+ "should be");
+ emit_int64((int64_t)dest);
+
+ const address stub_start_addr = addr_at(stub_start_offset);
+
+ assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
+
+ end_a_stub();
+}
+
+void MacroAssembler::ic_call(address entry) {
+ RelocationHolder rh = virtual_call_Relocation::spec(pc());
+ // address const_ptr = long_constant((jlong)Universe::non_oop_word());
+ // unsigned long offset;
+ // ldr_constant(rscratch2, const_ptr);
+ movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
+ trampoline_call(Address(entry, rh));
+}
+
+// Implementation of call_VM versions
+
+void MacroAssembler::call_VM(Register oop_result,
+ address entry_point,
+ bool check_exceptions) {
+ call_VM_helper(oop_result, entry_point, 0, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+ address entry_point,
+ Register arg_1,
+ bool check_exceptions) {
+ pass_arg1(this, arg_1);
+ call_VM_helper(oop_result, entry_point, 1, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+ address entry_point,
+ Register arg_1,
+ Register arg_2,
+ bool check_exceptions) {
+ assert(arg_1 != c_rarg2, "smashed arg");
+ pass_arg2(this, arg_2);
+ pass_arg1(this, arg_1);
+ call_VM_helper(oop_result, entry_point, 2, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+ address entry_point,
+ Register arg_1,
+ Register arg_2,
+ Register arg_3,
+ bool check_exceptions) {
+ assert(arg_1 != c_rarg3, "smashed arg");
+ assert(arg_2 != c_rarg3, "smashed arg");
+ pass_arg3(this, arg_3);
+
+ assert(arg_1 != c_rarg2, "smashed arg");
+ pass_arg2(this, arg_2);
+
+ pass_arg1(this, arg_1);
+ call_VM_helper(oop_result, entry_point, 3, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+ Register last_java_sp,
+ address entry_point,
+ int number_of_arguments,
+ bool check_exceptions) {
+ call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+ Register last_java_sp,
+ address entry_point,
+ Register arg_1,
+ bool check_exceptions) {
+ pass_arg1(this, arg_1);
+ call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+ Register last_java_sp,
+ address entry_point,
+ Register arg_1,
+ Register arg_2,
+ bool check_exceptions) {
+
+ assert(arg_1 != c_rarg2, "smashed arg");
+ pass_arg2(this, arg_2);
+ pass_arg1(this, arg_1);
+ call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+ Register last_java_sp,
+ address entry_point,
+ Register arg_1,
+ Register arg_2,
+ Register arg_3,
+ bool check_exceptions) {
+ assert(arg_1 != c_rarg3, "smashed arg");
+ assert(arg_2 != c_rarg3, "smashed arg");
+ pass_arg3(this, arg_3);
+ assert(arg_1 != c_rarg2, "smashed arg");
+ pass_arg2(this, arg_2);
+ pass_arg1(this, arg_1);
+ call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
+}
+
+
+void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
+ ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
+ str(zr, Address(java_thread, JavaThread::vm_result_offset()));
+ verify_oop(oop_result, "broken oop in call_VM_base");
+}
+
+void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
+ ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
+ str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
+}
+
+void MacroAssembler::align(int modulus) {
+ while (offset() % modulus != 0) nop();
+}
+
+// these are no-ops overridden by InterpreterMacroAssembler
+
+void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
+
+void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
+
+
+RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
+ Register tmp,
+ int offset) {
+ intptr_t value = *delayed_value_addr;
+ if (value != 0)
+ return RegisterOrConstant(value + offset);
+
+ // load indirectly to solve generation ordering problem
+ ldr(tmp, ExternalAddress((address) delayed_value_addr));
+
+ if (offset != 0)
+ add(tmp, tmp, offset);
+
+ return RegisterOrConstant(tmp);
+}
+
+
+void MacroAssembler::notify(int type) {
+ if (type == bytecode_start) {
+ // set_last_Java_frame(esp, rfp, (address)NULL);
+ Assembler::notify(type);
+ // reset_last_Java_frame(true, false);
+ }
+ else
+ Assembler::notify(type);
+}
+
+// Look up the method for a megamorphic invokeinterface call.
+// The target method is determined by <intf_klass, itable_index>.
+// The receiver klass is in recv_klass.
+// On success, the result will be in method_result, and execution falls through.
+// On failure, execution transfers to the given label.
+void MacroAssembler::lookup_interface_method(Register recv_klass,
+ Register intf_klass,
+ RegisterOrConstant itable_index,
+ Register method_result,
+ Register scan_temp,
+ Label& L_no_such_interface) {
+ assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
+ assert(itable_index.is_constant() || itable_index.as_register() == method_result,
+ "caller must use same register for non-constant itable index as for method");
+
+ // Compute start of first itableOffsetEntry (which is at the end of the vtable)
+ int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
+ int itentry_off = itableMethodEntry::method_offset_in_bytes();
+ int scan_step = itableOffsetEntry::size() * wordSize;
+ int vte_size = vtableEntry::size() * wordSize;
+ assert(vte_size == wordSize, "else adjust times_vte_scale");
+
+ ldrw(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));
+
+ // %%% Could store the aligned, prescaled offset in the klassoop.
+ // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
+ lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
+ add(scan_temp, scan_temp, vtable_base);
+ if (HeapWordsPerLong > 1) {
+ // Round up to align_object_offset boundary
+ // see code for instanceKlass::start_of_itable!
+ round_to(scan_temp, BytesPerLong);
+ }
+
+ // Adjust recv_klass by scaled itable_index, so we can free itable_index.
+ assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
+ // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
+ lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
+ if (itentry_off)
+ add(recv_klass, recv_klass, itentry_off);
+
+ // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
+ // if (scan->interface() == intf) {
+ // result = (klass + scan->offset() + itable_index);
+ // }
+ // }
+ Label search, found_method;
+
+ for (int peel = 1; peel >= 0; peel--) {
+ ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
+ cmp(intf_klass, method_result);
+
+ if (peel) {
+ br(Assembler::EQ, found_method);
+ } else {
+ br(Assembler::NE, search);
+ // (invert the test to fall through to found_method...)
+ }
+
+ if (!peel) break;
+
+ bind(search);
+
+ // Check that the previous entry is non-null. A null entry means that
+ // the receiver class doesn't implement the interface, and wasn't the
+ // same as when the caller was compiled.
+ cbz(method_result, L_no_such_interface);
+ add(scan_temp, scan_temp, scan_step);
+ }
+
+ bind(found_method);
+
+ // Got a hit.
+ ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
+ ldr(method_result, Address(recv_klass, scan_temp));
+}
+
+// virtual method calling
+void MacroAssembler::lookup_virtual_method(Register recv_klass,
+ RegisterOrConstant vtable_index,
+ Register method_result) {
+ const int base = InstanceKlass::vtable_start_offset() * wordSize;
+ assert(vtableEntry::size() * wordSize == 8,
+ "adjust the scaling in the code below");
+ int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
+
+ if (vtable_index.is_register()) {
+ lea(method_result, Address(recv_klass,
+ vtable_index.as_register(),
+ Address::lsl(LogBytesPerWord)));
+ ldr(method_result, Address(method_result, vtable_offset_in_bytes));
+ } else {
+ vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
+ ldr(method_result, Address(recv_klass, vtable_offset_in_bytes));
+ }
+}
+
+void MacroAssembler::check_klass_subtype(Register sub_klass,
+ Register super_klass,
+ Register temp_reg,
+ Label& L_success) {
+ Label L_failure;
+ check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
+ check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
+ bind(L_failure);
+}
+
+
+void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
+ Register super_klass,
+ Register temp_reg,
+ Label* L_success,
+ Label* L_failure,
+ Label* L_slow_path,
+ RegisterOrConstant super_check_offset) {
+ assert_different_registers(sub_klass, super_klass, temp_reg);
+ bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
+ if (super_check_offset.is_register()) {
+ assert_different_registers(sub_klass, super_klass,
+ super_check_offset.as_register());
+ } else if (must_load_sco) {
+ assert(temp_reg != noreg, "supply either a temp or a register offset");
+ }
+
+ Label L_fallthrough;
+ int label_nulls = 0;
+ if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
+ if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
+ if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
+ assert(label_nulls <= 1, "at most one NULL in the batch");
+
+ int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+ int sco_offset = in_bytes(Klass::super_check_offset_offset());
+ Address super_check_offset_addr(super_klass, sco_offset);
+
+ // Hacked jmp, which may only be used just before L_fallthrough.
+#define final_jmp(label) \
+ if (&(label) == &L_fallthrough) { /*do nothing*/ } \
+ else b(label) /*omit semi*/
+
+ // If the pointers are equal, we are done (e.g., String[] elements).
+ // This self-check enables sharing of secondary supertype arrays among
+ // non-primary types such as array-of-interface. Otherwise, each such
+ // type would need its own customized SSA.
+ // We move this check to the front of the fast path because many
+ // type checks are in fact trivially successful in this manner,
+ // so we get a nicely predicted branch right at the start of the check.
+ cmp(sub_klass, super_klass);
+ br(Assembler::EQ, *L_success);
+
+ // Check the supertype display:
+ if (must_load_sco) {
+ ldrw(temp_reg, super_check_offset_addr);
+ super_check_offset = RegisterOrConstant(temp_reg);
+ }
+ Address super_check_addr(sub_klass, super_check_offset);
+ ldr(rscratch1, super_check_addr);
+ cmp(super_klass, rscratch1); // load displayed supertype
+
+ // This check has worked decisively for primary supers.
+ // Secondary supers are sought in the super_cache ('super_cache_addr').
+ // (Secondary supers are interfaces and very deeply nested subtypes.)
+ // This works in the same check above because of a tricky aliasing
+ // between the super_cache and the primary super display elements.
+ // (The 'super_check_addr' can address either, as the case requires.)
+ // Note that the cache is updated below if it does not help us find
+ // what we need immediately.
+ // So if it was a primary super, we can just fail immediately.
+ // Otherwise, it's the slow path for us (no success at this point).
+
+ if (super_check_offset.is_register()) {
+ br(Assembler::EQ, *L_success);
+ cmp(super_check_offset.as_register(), sc_offset);
+ if (L_failure == &L_fallthrough) {
+ br(Assembler::EQ, *L_slow_path);
+ } else {
+ br(Assembler::NE, *L_failure);
+ final_jmp(*L_slow_path);
+ }
+ } else if (super_check_offset.as_constant() == sc_offset) {
+ // Need a slow path; fast failure is impossible.
+ if (L_slow_path == &L_fallthrough) {
+ br(Assembler::EQ, *L_success);
+ } else {
+ br(Assembler::NE, *L_slow_path);
+ final_jmp(*L_success);
+ }
+ } else {
+ // No slow path; it's a fast decision.
+ if (L_failure == &L_fallthrough) {
+ br(Assembler::EQ, *L_success);
+ } else {
+ br(Assembler::NE, *L_failure);
+ final_jmp(*L_success);
+ }
+ }
+
+ bind(L_fallthrough);
+
+#undef final_jmp
+}
+
+// These two are taken from x86, but they look generally useful
+
+// scans count pointer-sized words at [addr] for an occurrence of value,
+// generic
+void MacroAssembler::repne_scan(Register addr, Register value, Register count,
+ Register scratch) {
+ Label Lloop, Lexit;
+ cbz(count, Lexit);
+ bind(Lloop);
+ ldr(scratch, post(addr, wordSize));
+ cmp(value, scratch);
+ br(EQ, Lexit);
+ sub(count, count, 1);
+ cbnz(count, Lloop);
+ bind(Lexit);
+}
+
+// scans count 4-byte words at [addr] for an occurrence of value,
+// generic
+void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
+ Register scratch) {
+ Label Lloop, Lexit;
+ cbz(count, Lexit);
+ bind(Lloop);
+ ldrw(scratch, post(addr, wordSize));
+ cmpw(value, scratch);
+ br(EQ, Lexit);
+ sub(count, count, 1);
+ cbnz(count, Lloop);
+ bind(Lexit);
+}
+
+void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
+ Register super_klass,
+ Register temp_reg,
+ Register temp2_reg,
+ Label* L_success,
+ Label* L_failure,
+ bool set_cond_codes) {
+ assert_different_registers(sub_klass, super_klass, temp_reg);
+ if (temp2_reg != noreg)
+ assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
+#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
+
+ Label L_fallthrough;
+ int label_nulls = 0;
+ if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
+ if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
+ assert(label_nulls <= 1, "at most one NULL in the batch");
+
+ // a couple of useful fields in sub_klass:
+ int ss_offset = in_bytes(Klass::secondary_supers_offset());
+ int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+ Address secondary_supers_addr(sub_klass, ss_offset);
+ Address super_cache_addr( sub_klass, sc_offset);
+
+ BLOCK_COMMENT("check_klass_subtype_slow_path");
+
+ // Do a linear scan of the secondary super-klass chain.
+ // This code is rarely used, so simplicity is a virtue here.
+ // The repne_scan instruction uses fixed registers, which we must spill.
+ // Don't worry too much about pre-existing connections with the input regs.
+
+ assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
+ assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
+
+ // Get super_klass value into r0 (even if it was in r5 or r2).
+ RegSet pushed_registers;
+ if (!IS_A_TEMP(r2)) pushed_registers += r2;
+ if (!IS_A_TEMP(r5)) pushed_registers += r5;
+
+ if (super_klass != r0 || UseCompressedOops) {
+ if (!IS_A_TEMP(r0)) pushed_registers += r0;
+ }
+
+ push(pushed_registers, sp);
+
+#ifndef PRODUCT
+ mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
+ Address pst_counter_addr(rscratch2);
+ ldr(rscratch1, pst_counter_addr);
+ add(rscratch1, rscratch1, 1);
+ str(rscratch1, pst_counter_addr);
+#endif //PRODUCT
+
+ // We will consult the secondary-super array.
+ ldr(r5, secondary_supers_addr);
+ // Load the array length.
+ ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
+ // Skip to start of data.
+ add(r5, r5, Array<Klass*>::base_offset_in_bytes());
+
+ cmp(sp, zr); // Clear Z flag; SP is never zero
+ // Scan R2 words at [R5] for an occurrence of R0.
+ // Set NZ/Z based on last compare.
+ repne_scan(r5, r0, r2, rscratch1);
+
+ // Unspill the temp. registers:
+ pop(pushed_registers, sp);
+
+ br(Assembler::NE, *L_failure);
+
+ // Success. Cache the super we found and proceed in triumph.
+ str(super_klass, super_cache_addr);
+
+ if (L_success != &L_fallthrough) {
+ b(*L_success);
+ }
+
+#undef IS_A_TEMP
+
+ bind(L_fallthrough);
+}
+
+
+void MacroAssembler::verify_oop(Register reg, const char* s) {
+ if (!VerifyOops) return;
+
+ // Pass register number to verify_oop_subroutine
+ const char* b = NULL;
+ {
+ ResourceMark rm;
+ stringStream ss;
+ ss.print("verify_oop: %s: %s", reg->name(), s);
+ b = code_string(ss.as_string());
+ }
+ BLOCK_COMMENT("verify_oop {");
+
+ stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
+ stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
+
+ mov(r0, reg);
+ mov(rscratch1, (address)b);
+
+ // call indirectly to solve generation ordering problem
+ lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
+ ldr(rscratch2, Address(rscratch2));
+ blr(rscratch2);
+
+ ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
+ ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
+
+ BLOCK_COMMENT("} verify_oop");
+}
+
+void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
+ if (!VerifyOops) return;
+
+ const char* b = NULL;
+ {
+ ResourceMark rm;
+ stringStream ss;
+ ss.print("verify_oop_addr: %s", s);
+ b = code_string(ss.as_string());
+ }
+ BLOCK_COMMENT("verify_oop_addr {");
+
+ stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
+ stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
+
+ // addr may contain sp so we will have to adjust it based on the
+ // pushes that we just did.
+ if (addr.uses(sp)) {
+ lea(r0, addr);
+ ldr(r0, Address(r0, 4 * wordSize));
+ } else {
+ ldr(r0, addr);
+ }
+ mov(rscratch1, (address)b);
+
+ // call indirectly to solve generation ordering problem
+ lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
+ ldr(rscratch2, Address(rscratch2));
+ blr(rscratch2);
+
+ ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
+ ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
+
+ BLOCK_COMMENT("} verify_oop_addr");
+}
+
+Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
+ int extra_slot_offset) {
+ // cf. TemplateTable::prepare_invoke(), if (load_receiver).
+ int stackElementSize = Interpreter::stackElementSize;
+ int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
+#ifdef ASSERT
+ int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
+ assert(offset1 - offset == stackElementSize, "correct arithmetic");
+#endif
+ if (arg_slot.is_constant()) {
+ return Address(esp, arg_slot.as_constant() * stackElementSize
+ + offset);
+ } else {
+ add(rscratch1, esp, arg_slot.as_register(),
+ ext::uxtx, exact_log2(stackElementSize));
+ return Address(rscratch1, offset);
+ }
+}
+
+void MacroAssembler::call_VM_leaf_base(address entry_point,
+ int number_of_arguments,
+ Label *retaddr) {
+ call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
+}
+
+void MacroAssembler::call_VM_leaf_base1(address entry_point,
+ int number_of_gp_arguments,
+ int number_of_fp_arguments,
+ ret_type type,
+ Label *retaddr) {
+ Label E, L;
+
+ stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
+
+ // We add 1 to number_of_gp_arguments because the thread passed in c_rarg0
+ // is not counted
+ mov(rscratch1, entry_point);
+ blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
+ if (retaddr)
+ bind(*retaddr);
+
+ ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
+ maybe_isb();
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
+ call_VM_leaf_base(entry_point, number_of_arguments);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
+ pass_arg0(this, arg_0);
+ call_VM_leaf_base(entry_point, 1);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
+ pass_arg0(this, arg_0);
+ pass_arg1(this, arg_1);
+ call_VM_leaf_base(entry_point, 2);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
+ Register arg_1, Register arg_2) {
+ pass_arg0(this, arg_0);
+ pass_arg1(this, arg_1);
+ pass_arg2(this, arg_2);
+ call_VM_leaf_base(entry_point, 3);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
+ pass_arg0(this, arg_0);
+ MacroAssembler::call_VM_leaf_base(entry_point, 1);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
+
+ assert(arg_0 != c_rarg1, "smashed arg");
+ pass_arg1(this, arg_1);
+ pass_arg0(this, arg_0);
+ MacroAssembler::call_VM_leaf_base(entry_point, 2);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
+ assert(arg_0 != c_rarg2, "smashed arg");
+ assert(arg_1 != c_rarg2, "smashed arg");
+ pass_arg2(this, arg_2);
+ assert(arg_0 != c_rarg1, "smashed arg");
+ pass_arg1(this, arg_1);
+ pass_arg0(this, arg_0);
+ MacroAssembler::call_VM_leaf_base(entry_point, 3);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
+ assert(arg_0 != c_rarg3, "smashed arg");
+ assert(arg_1 != c_rarg3, "smashed arg");
+ assert(arg_2 != c_rarg3, "smashed arg");
+ pass_arg3(this, arg_3);
+ assert(arg_0 != c_rarg2, "smashed arg");
+ assert(arg_1 != c_rarg2, "smashed arg");
+ pass_arg2(this, arg_2);
+ assert(arg_0 != c_rarg1, "smashed arg");
+ pass_arg1(this, arg_1);
+ pass_arg0(this, arg_0);
+ MacroAssembler::call_VM_leaf_base(entry_point, 4);
+}
+
+void MacroAssembler::null_check(Register reg, int offset) {
+ if (needs_explicit_null_check(offset)) {
+ // provoke OS NULL exception if reg = NULL by
+ // accessing M[reg] w/o changing any registers
+ // NOTE: this is plenty to provoke a segv
+ ldr(zr, Address(reg));
+ } else {
+ // nothing to do, (later) access of M[reg + offset]
+ // will provoke OS NULL exception if reg = NULL
+ }
+}
+
+// MacroAssembler protected routines needed to implement
+// public methods
+
+void MacroAssembler::mov(Register r, Address dest) {
+ code_section()->relocate(pc(), dest.rspec());
+ u_int64_t imm64 = (u_int64_t)dest.target();
+ movptr(r, imm64);
+}
+
+// Move a constant pointer into r. In AArch64 mode the virtual
+// address space is 48 bits in size, so we only need three
+// instructions to create a patchable instruction sequence that can
+// reach anywhere.
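+//
+// For example (a sketch): movptr(r, 0x123456789abcul) emits
+//   movz r, #0x9abc
+//   movk r, #0x5678, lsl #16
+//   movk r, #0x1234, lsl #32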
+void MacroAssembler::movptr(Register r, uintptr_t imm64) {
+#ifndef PRODUCT
+ {
+ char buffer[64];
+ snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
+ block_comment(buffer);
+ }
+#endif
+ assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
+ movz(r, imm64 & 0xffff);
+ imm64 >>= 16;
+ movk(r, imm64 & 0xffff, 16);
+ imm64 >>= 16;
+ movk(r, imm64 & 0xffff, 32);
+}
+
+void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
+{
+#ifndef PRODUCT
+ {
+ char buffer[64];
+ snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
+ block_comment(buffer);
+ }
+#endif
+ if (operand_valid_for_logical_immediate(false, imm64)) {
+ orr(dst, zr, imm64);
+ } else {
+ // we can use a combination of MOVZ or MOVN with
+ // MOVK to build up the constant
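+ // e.g. (a sketch): imm64 == 0x000012340000ffffULL splits into halfwords
+ // { 0xffff, 0x0000, 0x1234, 0x0000 }, so zero_count == 2 below and we emit
+ //   movz dst, #0xffff
+ //   movk dst, #0x1234, lsl #32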
+ u_int64_t imm_h[4];
+ int zero_count = 0;
+ int neg_count = 0;
+ int i;
+ for (i = 0; i < 4; i++) {
+ imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
+ if (imm_h[i] == 0) {
+ zero_count++;
+ } else if (imm_h[i] == 0xffffL) {
+ neg_count++;
+ }
+ }
+ if (zero_count == 4) {
+ // one MOVZ will do
+ movz(dst, 0);
+ } else if (neg_count == 4) {
+ // one MOVN will do
+ movn(dst, 0);
+ } else if (zero_count == 3) {
+ for (i = 0; i < 4; i++) {
+ if (imm_h[i] != 0L) {
+ movz(dst, (u_int32_t)imm_h[i], (i << 4));
+ break;
+ }
+ }
+ } else if (neg_count == 3) {
+ // one MOVN will do
+ for (int i = 0; i < 4; i++) {
+ if (imm_h[i] != 0xffffL) {
+ movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
+ break;
+ }
+ }
+ } else if (zero_count == 2) {
+ // one MOVZ and one MOVK will do
+ for (i = 0; i < 3; i++) {
+ if (imm_h[i] != 0L) {
+ movz(dst, (u_int32_t)imm_h[i], (i << 4));
+ i++;
+ break;
+ }
+ }
+ for (;i < 4; i++) {
+ if (imm_h[i] != 0L) {
+ movk(dst, (u_int32_t)imm_h[i], (i << 4));
+ }
+ }
+ } else if (neg_count == 2) {
+ // one MOVN and one MOVK will do
+ for (i = 0; i < 4; i++) {
+ if (imm_h[i] != 0xffffL) {
+ movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
+ i++;
+ break;
+ }
+ }
+ for (;i < 4; i++) {
+ if (imm_h[i] != 0xffffL) {
+ movk(dst, (u_int32_t)imm_h[i], (i << 4));
+ }
+ }
+ } else if (zero_count == 1) {
+ // one MOVZ and two MOVKs will do
+ for (i = 0; i < 4; i++) {
+ if (imm_h[i] != 0L) {
+ movz(dst, (u_int32_t)imm_h[i], (i << 4));
+ i++;
+ break;
+ }
+ }
+ for (;i < 4; i++) {
+ if (imm_h[i] != 0x0L) {
+ movk(dst, (u_int32_t)imm_h[i], (i << 4));
+ }
+ }
+ } else if (neg_count == 1) {
+ // one MOVN and two MOVKs will do
+ for (i = 0; i < 4; i++) {
+ if (imm_h[i] != 0xffffL) {
+ movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
+ i++;
+ break;
+ }
+ }
+ for (;i < 4; i++) {
+ if (imm_h[i] != 0xffffL) {
+ movk(dst, (u_int32_t)imm_h[i], (i << 4));
+ }
+ }
+ } else {
+ // use a MOVZ and 3 MOVKs (makes it easier to debug)
+ movz(dst, (u_int32_t)imm_h[0], 0);
+ for (i = 1; i < 4; i++) {
+ movk(dst, (u_int32_t)imm_h[i], (i << 4));
+ }
+ }
+ }
+}
+
+void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
+{
+#ifndef PRODUCT
+ {
+ char buffer[64];
+ snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
+ block_comment(buffer);
+ }
+#endif
+ if (operand_valid_for_logical_immediate(true, imm32)) {
+ orrw(dst, zr, imm32);
+ } else {
+ // we can use MOVZ, MOVN or two calls to MOVK to build up the
+ // constant
+ u_int32_t imm_h[2];
+ imm_h[0] = imm32 & 0xffff;
+ imm_h[1] = ((imm32 >> 16) & 0xffff);
+ if (imm_h[0] == 0) {
+ movzw(dst, imm_h[1], 16);
+ } else if (imm_h[0] == 0xffff) {
+ movnw(dst, imm_h[1] ^ 0xffff, 16);
+ } else if (imm_h[1] == 0) {
+ movzw(dst, imm_h[0], 0);
+ } else if (imm_h[1] == 0xffff) {
+ movnw(dst, imm_h[0] ^ 0xffff, 0);
+ } else {
+ // use a MOVZ and MOVK (makes it easier to debug)
+ movzw(dst, imm_h[0], 0);
+ movkw(dst, imm_h[1], 16);
+ }
+ }
+}
+
+// Form an address from base + offset in Rd. Rd may or may
+// not actually be used: you must use the Address that is returned.
+// It is up to you to ensure that the shift provided matches the size
+// of your data.
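+//
+// For example (a sketch, assuming the usual AArch64 scaled 12-bit load/store
+// and add/sub immediate limits): form_address(Rd, base, 0x40008, 3) cannot
+// use a single scaled immediate offset, so it emits
+//   add Rd, base, #0x40000
+// and returns Address(Rd, 8), which addresses base + 0x40008 in total.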
+Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
+ if (Address::offset_ok_for_immed(byte_offset, shift))
+ // It fits; no need for any heroics
+ return Address(base, byte_offset);
+
+ // Don't do anything clever with negative or misaligned offsets
+ unsigned mask = (1 << shift) - 1;
+ if (byte_offset < 0 || byte_offset & mask) {
+ mov(Rd, byte_offset);
+ add(Rd, base, Rd);
+ return Address(Rd);
+ }
+
+ // See if we can do this with two 12-bit offsets
+ {
+ unsigned long word_offset = byte_offset >> shift;
+ unsigned long masked_offset = word_offset & 0xfff000;
+ if (Address::offset_ok_for_immed(word_offset - masked_offset)
+ && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
+ add(Rd, base, masked_offset << shift);
+ word_offset -= masked_offset;
+ return Address(Rd, word_offset << shift);
+ }
+ }
+
+ // Do it the hard way
+ mov(Rd, byte_offset);
+ add(Rd, base, Rd);
+ return Address(Rd);
+}
+
+void MacroAssembler::atomic_incw(Register counter_addr, Register tmp) {
+ Label retry_load;
+ bind(retry_load);
+ // flush and load exclusive from the memory location
+ ldxrw(tmp, counter_addr);
+ addw(tmp, tmp, 1);
+ // if we store+flush with no intervening write tmp will be zero
+ stxrw(tmp, tmp, counter_addr);
+ cbnzw(tmp, retry_load);
+}
+
+
+int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
+ bool want_remainder, Register scratch)
+{
+ // Full implementation of Java idiv and irem. The function
+ // returns the (pc) offset of the div instruction - may be needed
+ // for implicit exceptions.
+ //
+ // constraint : ra/rb =/= scratch
+ // normal case
+ //
+ // input : ra: dividend
+ // rb: divisor
+ //
+ // result: either
+ // quotient (= ra idiv rb)
+ // remainder (= ra irem rb)
+
+ assert(ra != scratch && rb != scratch, "reg cannot be scratch");
+
+ int idivl_offset = offset();
+ if (! want_remainder) {
+ sdivw(result, ra, rb);
+ } else {
+ sdivw(scratch, ra, rb);
+ msubw(result, scratch, rb, ra);
+ }
+
+ return idivl_offset;
+}
+
+int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
+ bool want_remainder, Register scratch)
+{
+ // Full implementation of Java ldiv and lrem. The function
+ // returns the (pc) offset of the div instruction - may be needed
+ // for implicit exceptions.
+ //
+ // constraint : ra/rb =/= scratch
+ // normal case
+ //
+ // input : ra: dividend
+ // rb: divisor
+ //
+ // result: either
+ // quotient (= ra idiv rb)
+ // remainder (= ra irem rb)
+
+ assert(ra != scratch && rb != scratch, "reg cannot be scratch");
+
+ int idivq_offset = offset();
+ if (! want_remainder) {
+ sdiv(result, ra, rb);
+ } else {
+ sdiv(scratch, ra, rb);
+ msub(result, scratch, rb, ra);
+ }
+
+ return idivq_offset;
+}
+
+// MacroAssembler routines found actually to be needed
+
+void MacroAssembler::push(Register src)
+{
+ str(src, Address(pre(esp, -1 * wordSize)));
+}
+
+void MacroAssembler::pop(Register dst)
+{
+ ldr(dst, Address(post(esp, 1 * wordSize)));
+}
+
+// Note: load_unsigned_short used to be called load_unsigned_word.
+int MacroAssembler::load_unsigned_short(Register dst, Address src) {
+ int off = offset();
+ ldrh(dst, src);
+ return off;
+}
+
+int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
+ int off = offset();
+ ldrb(dst, src);
+ return off;
+}
+
+int MacroAssembler::load_signed_short(Register dst, Address src) {
+ int off = offset();
+ ldrsh(dst, src);
+ return off;
+}
+
+int MacroAssembler::load_signed_byte(Register dst, Address src) {
+ int off = offset();
+ ldrsb(dst, src);
+ return off;
+}
+
+int MacroAssembler::load_signed_short32(Register dst, Address src) {
+ int off = offset();
+ ldrshw(dst, src);
+ return off;
+}
+
+int MacroAssembler::load_signed_byte32(Register dst, Address src) {
+ int off = offset();
+ ldrsbw(dst, src);
+ return off;
+}
+
+void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
+ switch (size_in_bytes) {
+ case 8: ldr(dst, src); break;
+ case 4: ldrw(dst, src); break;
+ case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
+ case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
+ default: ShouldNotReachHere();
+ }
+}
+
+void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
+ switch (size_in_bytes) {
+ case 8: str(src, dst); break;
+ case 4: strw(src, dst); break;
+ case 2: strh(src, dst); break;
+ case 1: strb(src, dst); break;
+ default: ShouldNotReachHere();
+ }
+}
+
+void MacroAssembler::decrementw(Register reg, int value)
+{
+ if (value < 0) { incrementw(reg, -value); return; }
+ if (value == 0) { return; }
+ if (value < (1 << 12)) { subw(reg, reg, value); return; }
+ /* else */ {
+ guarantee(reg != rscratch2, "invalid dst for register decrement");
+ movw(rscratch2, (unsigned)value);
+ subw(reg, reg, rscratch2);
+ }
+}
+
+void MacroAssembler::decrement(Register reg, int value)
+{
+ if (value < 0) { increment(reg, -value); return; }
+ if (value == 0) { return; }
+ if (value < (1 << 12)) { sub(reg, reg, value); return; }
+ /* else */ {
+ assert(reg != rscratch2, "invalid dst for register decrement");
+ mov(rscratch2, (unsigned long)value);
+ sub(reg, reg, rscratch2);
+ }
+}
+
+void MacroAssembler::decrementw(Address dst, int value)
+{
+ assert(!dst.uses(rscratch1), "invalid dst for address decrement");
+ ldrw(rscratch1, dst);
+ decrementw(rscratch1, value);
+ strw(rscratch1, dst);
+}
+
+void MacroAssembler::decrement(Address dst, int value)
+{
+ assert(!dst.uses(rscratch1), "invalid address for decrement");
+ ldr(rscratch1, dst);
+ decrement(rscratch1, value);
+ str(rscratch1, dst);
+}
+
+void MacroAssembler::incrementw(Register reg, int value)
+{
+ if (value < 0) { decrementw(reg, -value); return; }
+ if (value == 0) { return; }
+ if (value < (1 << 12)) { addw(reg, reg, value); return; }
+ /* else */ {
+ assert(reg != rscratch2, "invalid dst for register increment");
+ movw(rscratch2, (unsigned)value);
+ addw(reg, reg, rscratch2);
+ }
+}
+
+void MacroAssembler::increment(Register reg, int value)
+{
+ if (value < 0) { decrement(reg, -value); return; }
+ if (value == 0) { return; }
+ if (value < (1 << 12)) { add(reg, reg, value); return; }
+ /* else */ {
+ assert(reg != rscratch2, "invalid dst for register increment");
+ movw(rscratch2, (unsigned)value);
+ add(reg, reg, rscratch2);
+ }
+}
+
+void MacroAssembler::incrementw(Address dst, int value)
+{
+ assert(!dst.uses(rscratch1), "invalid dst for address increment");
+ ldrw(rscratch1, dst);
+ incrementw(rscratch1, value);
+ strw(rscratch1, dst);
+}
+
+void MacroAssembler::increment(Address dst, int value)
+{
+ assert(!dst.uses(rscratch1), "invalid dst for address increment");
+ ldr(rscratch1, dst);
+ increment(rscratch1, value);
+ str(rscratch1, dst);
+}
+
+
+void MacroAssembler::pusha() {
+ push(0x7fffffff, sp);
+}
+
+void MacroAssembler::popa() {
+ pop(0x7fffffff, sp);
+}
+
+// Push lots of registers in the bit set supplied. Don't push sp.
+// Return the number of words pushed
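+//
+// For example (a sketch): push(0b1110, sp) collects {r1, r2, r3}, pads with
+// zr to keep the count even, and emits
+//   stp r1, r2, [sp, #-32]!
+//   stp r3, zr, [sp, #16]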
+int MacroAssembler::push(unsigned int bitset, Register stack) {
+ int words_pushed = 0;
+
+ // Scan bitset to accumulate register pairs
+ unsigned char regs[32];
+ int count = 0;
+ for (int reg = 0; reg <= 30; reg++) {
+ if (1 & bitset)
+ regs[count++] = reg;
+ bitset >>= 1;
+ }
+ regs[count++] = zr->encoding_nocheck();
+ count &= ~1; // Only push an even nuber of regs
+
+ if (count) {
+ stp(as_Register(regs[0]), as_Register(regs[1]),
+ Address(pre(stack, -count * wordSize)));
+ words_pushed += 2;
+ }
+ for (int i = 2; i < count; i += 2) {
+ stp(as_Register(regs[i]), as_Register(regs[i+1]),
+ Address(stack, i * wordSize));
+ words_pushed += 2;
+ }
+
+ assert(words_pushed == count, "oops, pushed != count");
+
+ return count;
+}
+
+int MacroAssembler::pop(unsigned int bitset, Register stack) {
+ int words_pushed = 0;
+
+ // Scan bitset to accumulate register pairs
+ unsigned char regs[32];
+ int count = 0;
+ for (int reg = 0; reg <= 30; reg++) {
+ if (1 & bitset)
+ regs[count++] = reg;
+ bitset >>= 1;
+ }
+ regs[count++] = zr->encoding_nocheck();
+ count &= ~1;
+
+ for (int i = 2; i < count; i += 2) {
+ ldp(as_Register(regs[i]), as_Register(regs[i+1]),
+ Address(stack, i * wordSize));
+ words_pushed += 2;
+ }
+ if (count) {
+ ldp(as_Register(regs[0]), as_Register(regs[1]),
+ Address(post(stack, count * wordSize)));
+ words_pushed += 2;
+ }
+
+ assert(words_pushed == count, "oops, pushed != count");
+
+ return count;
+}
+#ifdef ASSERT
+void MacroAssembler::verify_heapbase(const char* msg) {
+#if 0
+ assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
+ assert (Universe::heap() != NULL, "java heap should be initialized");
+ if (CheckCompressedOops) {
+ Label ok;
+ push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
+ cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
+ br(Assembler::EQ, ok);
+ stop(msg);
+ bind(ok);
+ pop(1 << rscratch1->encoding(), sp);
+ }
+#endif
+}
+#endif
+
+void MacroAssembler::stop(const char* msg) {
+ address ip = pc();
+ pusha();
+ mov(c_rarg0, (address)msg);
+ mov(c_rarg1, (address)ip);
+ mov(c_rarg2, sp);
+ mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
+ // call(c_rarg3);
+ blrt(c_rarg3, 3, 0, 1);
+ hlt(0);
+}
+
+// If a constant does not fit in an immediate field, generate some
+// number of MOV instructions and then perform the operation.
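+//
+// For example (a sketch): an add with immediate 0x123456 does not fit a
+// single 12-bit (optionally shifted) immediate, but uabs(imm) < 2^24, so the
+// helper below splits it into two adds:
+//   add Rd, Rn, #0x123000    // top twelve significant bits, LSL #12
+//   add Rd, Rd, #0x456       // low twelve bits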
+void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
+ add_sub_imm_insn insn1,
+ add_sub_reg_insn insn2) {
+ assert(Rd != zr, "Rd = zr and not setting flags?");
+ if (operand_valid_for_add_sub_immediate((int)imm)) {
+ (this->*insn1)(Rd, Rn, imm);
+ } else {
+ if (uabs(imm) < (1 << 24)) {
+ (this->*insn1)(Rd, Rn, imm & -(1 << 12));
+ (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
+ } else {
+ assert_different_registers(Rd, Rn);
+ mov(Rd, (uint64_t)imm);
+ (this->*insn2)(Rd, Rn, Rd, LSL, 0);
+ }
+ }
+}
+
+// Separate version which sets the flags. Optimisations are more restricted
+// because we must set the flags correctly.
+void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
+ add_sub_imm_insn insn1,
+ add_sub_reg_insn insn2) {
+ if (operand_valid_for_add_sub_immediate((int)imm)) {
+ (this->*insn1)(Rd, Rn, imm);
+ } else {
+ assert_different_registers(Rd, Rn);
+ assert(Rd != zr, "overflow in immediate operand");
+ mov(Rd, (uint64_t)imm);
+ (this->*insn2)(Rd, Rn, Rd, LSL, 0);
+ }
+}
+
+
+void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
+ if (increment.is_register()) {
+ add(Rd, Rn, increment.as_register());
+ } else {
+ add(Rd, Rn, increment.as_constant());
+ }
+}
+
+void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
+ if (increment.is_register()) {
+ addw(Rd, Rn, increment.as_register());
+ } else {
+ addw(Rd, Rn, increment.as_constant());
+ }
+}
+
+void MacroAssembler::reinit_heapbase()
+{
+ if (UseCompressedOops) {
+ if (Universe::is_fully_initialized()) {
+ mov(rheapbase, Universe::narrow_ptrs_base());
+ } else {
+ lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
+ ldr(rheapbase, Address(rheapbase));
+ }
+ }
+}
+
+// this simulates the behaviour of the x86 cmpxchg instruction using a
+// load linked/store conditional pair. we use the acquire/release
+// versions of these instructions so that we flush pending writes as
+// per Java semantics.
+
+// n.b. the x86 version assumes the old value to be compared against is
+// in rax and updates rax with the value located in memory if the
+// cmpxchg fails. we supply a register for the old value explicitly
+
+// the aarch64 load linked/store conditional instructions do not
+// accept an offset. so, unlike x86, we must provide a plain register
+// to identify the memory word to be compared/exchanged rather than a
+// register+offset Address.
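+//
+// at the instruction level the generated compare-and-swap loop looks roughly
+// like this (a sketch; see cmpxchgptr/cmpxchgw below):
+//
+//   retry:
+//     ldaxr  tmp, [addr]          // load-acquire exclusive
+//     cmp    tmp, oldv
+//     b.ne   nope                 // memory word differs -> fail path
+//     stlxr  tmp, newv, [addr]    // store-release exclusive, tmp == 0 on success
+//     cbz    tmp, succeed
+//     b      retry                // lost exclusive access -> try again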
+
+void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
+ Label &succeed, Label *fail) {
+ // oldv holds comparison value
+ // newv holds value to write in exchange
+ // addr identifies memory word to compare against/update
+ // tmp returns 0/1 for success/failure
+ Label retry_load, nope;
+
+ bind(retry_load);
+ // flush and load exclusive from the memory location
+ // and fail if it is not what we expect
+ ldaxr(tmp, addr);
+ cmp(tmp, oldv);
+ br(Assembler::NE, nope);
+ // if we store+flush with no intervening write tmp will be zero
+ stlxr(tmp, newv, addr);
+ cbzw(tmp, succeed);
+ // retry so we only ever return after a load fails to compare
+ // ensures we don't return a stale value after a failed write.
+ b(retry_load);
+ // if the memory word differs we return it in oldv and signal a fail
+ bind(nope);
+ membar(AnyAny);
+ mov(oldv, tmp);
+ if (fail)
+ b(*fail);
+}
+
+void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
+ Label &succeed, Label *fail) {
+ // oldv holds comparison value
+ // newv holds value to write in exchange
+ // addr identifies memory word to compare against/update
+ // tmp returns 0/1 for success/failure
+ Label retry_load, nope;
+
+ bind(retry_load);
+ // flush and load exclusive from the memory location
+ // and fail if it is not what we expect
+ ldaxrw(tmp, addr);
+ cmp(tmp, oldv);
+ br(Assembler::NE, nope);
+ // if we store+flush with no intervening write tmp will be zero
+ stlxrw(tmp, newv, addr);
+ cbzw(tmp, succeed);
+ // retry so we only ever return after a load fails to compare
+ // ensures we don't return a stale value after a failed write.
+ b(retry_load);
+ // if the memory word differs we return it in oldv and signal a fail
+ bind(nope);
+ membar(AnyAny);
+ mov(oldv, tmp);
+ if (fail)
+ b(*fail);
+}
+
+static bool different(Register a, RegisterOrConstant b, Register c) {
+ if (b.is_constant())
+ return a != c;
+ else
+ return a != b.as_register() && a != c && b.as_register() != c;
+}
+
+#define ATOMIC_OP(LDXR, OP, STXR) \
+void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
+ Register result = rscratch2; \
+ if (prev->is_valid()) \
+ result = different(prev, incr, addr) ? prev : rscratch2; \
+ \
+ Label retry_load; \
+ bind(retry_load); \
+ LDXR(result, addr); \
+ OP(rscratch1, result, incr); \
+ STXR(rscratch1, rscratch1, addr); \
+ cbnzw(rscratch1, retry_load); \
+ if (prev->is_valid() && prev != result) \
+ mov(prev, result); \
+}
+
+ATOMIC_OP(ldxr, add, stxr)
+ATOMIC_OP(ldxrw, addw, stxrw)
+
+#undef ATOMIC_OP
+
+#define ATOMIC_XCHG(OP, LDXR, STXR) \
+void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
+ Register result = rscratch2; \
+ if (prev->is_valid()) \
+ result = different(prev, newv, addr) ? prev : rscratch2; \
+ \
+ Label retry_load; \
+ bind(retry_load); \
+ LDXR(result, addr); \
+ STXR(rscratch1, newv, addr); \
+ cbnzw(rscratch1, retry_load); \
+ if (prev->is_valid() && prev != result) \
+ mov(prev, result); \
+}
+
+ATOMIC_XCHG(xchg, ldxr, stxr)
+ATOMIC_XCHG(xchgw, ldxrw, stxrw)
+
+#undef ATOMIC_XCHG
+
+void MacroAssembler::incr_allocated_bytes(Register thread,
+ Register var_size_in_bytes,
+ int con_size_in_bytes,
+ Register t1) {
+ if (!thread->is_valid()) {
+ thread = rthread;
+ }
+ assert(t1->is_valid(), "need temp reg");
+
+ ldr(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
+ if (var_size_in_bytes->is_valid()) {
+ add(t1, t1, var_size_in_bytes);
+ } else {
+ add(t1, t1, con_size_in_bytes);
+ }
+ str(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
+}
+
+#ifndef PRODUCT
+extern "C" void findpc(intptr_t x);
+#endif
+
+void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
+{
+ // In order to get locks to work, we need to fake an in_VM state
+ if (ShowMessageBoxOnError) {
+ JavaThread* thread = JavaThread::current();
+ JavaThreadState saved_state = thread->thread_state();
+ thread->set_thread_state(_thread_in_vm);
+#ifndef PRODUCT
+ if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
+ ttyLocker ttyl;
+ BytecodeCounter::print();
+ }
+#endif
+ if (os::message_box(msg, "Execution stopped, print registers?")) {
+ ttyLocker ttyl;
+ tty->print_cr(" pc = 0x%016lx", pc);
+#ifndef PRODUCT
+ tty->cr();
+ findpc(pc);
+ tty->cr();
+#endif
+ tty->print_cr(" r0 = 0x%016lx", regs[0]);
+ tty->print_cr(" r1 = 0x%016lx", regs[1]);
+ tty->print_cr(" r2 = 0x%016lx", regs[2]);
+ tty->print_cr(" r3 = 0x%016lx", regs[3]);
+ tty->print_cr(" r4 = 0x%016lx", regs[4]);
+ tty->print_cr(" r5 = 0x%016lx", regs[5]);
+ tty->print_cr(" r6 = 0x%016lx", regs[6]);
+ tty->print_cr(" r7 = 0x%016lx", regs[7]);
+ tty->print_cr(" r8 = 0x%016lx", regs[8]);
+ tty->print_cr(" r9 = 0x%016lx", regs[9]);
+ tty->print_cr("r10 = 0x%016lx", regs[10]);
+ tty->print_cr("r11 = 0x%016lx", regs[11]);
+ tty->print_cr("r12 = 0x%016lx", regs[12]);
+ tty->print_cr("r13 = 0x%016lx", regs[13]);
+ tty->print_cr("r14 = 0x%016lx", regs[14]);
+ tty->print_cr("r15 = 0x%016lx", regs[15]);
+ tty->print_cr("r16 = 0x%016lx", regs[16]);
+ tty->print_cr("r17 = 0x%016lx", regs[17]);
+ tty->print_cr("r18 = 0x%016lx", regs[18]);
+ tty->print_cr("r19 = 0x%016lx", regs[19]);
+ tty->print_cr("r20 = 0x%016lx", regs[20]);
+ tty->print_cr("r21 = 0x%016lx", regs[21]);
+ tty->print_cr("r22 = 0x%016lx", regs[22]);
+ tty->print_cr("r23 = 0x%016lx", regs[23]);
+ tty->print_cr("r24 = 0x%016lx", regs[24]);
+ tty->print_cr("r25 = 0x%016lx", regs[25]);
+ tty->print_cr("r26 = 0x%016lx", regs[26]);
+ tty->print_cr("r27 = 0x%016lx", regs[27]);
+ tty->print_cr("r28 = 0x%016lx", regs[28]);
+      tty->print_cr("r29 = 0x%016lx", regs[29]);
+      tty->print_cr("r30 = 0x%016lx", regs[30]);
+ tty->print_cr("r31 = 0x%016lx", regs[31]);
+ BREAKPOINT;
+ }
+ ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
+ } else {
+ ttyLocker ttyl;
+ ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
+ msg);
+ assert(false, err_msg("DEBUG MESSAGE: %s", msg));
+ }
+}
+
+#ifdef BUILTIN_SIM
+// routine to generate an x86 prolog for a stub function which
+// bootstraps into the generated ARM code which directly follows the
+// stub
+//
+// the argument encodes the number of general and fp registers
+// passed by the caller and the calling convention (currently just
+// the number of general registers and assumes C argument passing)
+
+extern "C" {
+int aarch64_stub_prolog_size();
+void aarch64_stub_prolog();
+void aarch64_prolog();
+}
+
+void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
+ address *prolog_ptr)
+{
+ int calltype = (((ret_type & 0x3) << 8) |
+ ((fp_arg_count & 0xf) << 4) |
+ (gp_arg_count & 0xf));
+
+  // the address of the x86 to ARM entry code we need to copy here
+ address start = pc();
+ // printf("start = %lx\n", start);
+ int byteCount = aarch64_stub_prolog_size();
+ // printf("byteCount = %x\n", byteCount);
+ int instructionCount = (byteCount + 3)/ 4;
+ // printf("instructionCount = %x\n", instructionCount);
+ for (int i = 0; i < instructionCount; i++) {
+ nop();
+ }
+
+ memcpy(start, (void*)aarch64_stub_prolog, byteCount);
+
+  // write the address of the setup routine and the call format at the
+  // end of the copied code
+ u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
+ if (prolog_ptr)
+ patch_end[-2] = (u_int64_t)prolog_ptr;
+ patch_end[-1] = calltype;
+}
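+
+// Illustrative only: the calltype word patched in above packs its fields as
+//
+//   int gp_args  =  calltype       & 0xf;  // bits 0..3: general registers
+//   int fp_args  = (calltype >> 4) & 0xf;  // bits 4..7: fp registers
+//   int ret_type = (calltype >> 8) & 0x3;  // bits 8..9: return type
+//
+// (the decode above is a sketch of the encoding used here, not necessarily
+// the exact code the simulator uses to unpack it)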
+#endif
+
+void MacroAssembler::push_CPU_state() {
+ push(0x3fffffff, sp); // integer registers except lr & sp
+
+ for (int i = 30; i >= 0; i -= 2)
+ stpd(as_FloatRegister(i), as_FloatRegister(i+1),
+ Address(pre(sp, -2 * wordSize)));
+}
+
+void MacroAssembler::pop_CPU_state() {
+ for (int i = 0; i < 32; i += 2)
+ ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
+ Address(post(sp, 2 * wordSize)));
+
+ pop(0x3fffffff, sp); // integer registers except lr & sp
+}
+
+/**
+ * Emits code to update CRC-32 with a byte value according to constants in table
+ *
+ * @param [in,out]crc Register containing the crc.
+ * @param [in]val Register containing the byte to fold into the CRC.
+ * @param [in]table Register containing the table of crc constants.
+ *
+ * uint32_t crc;
+ * val = crc_table[(val ^ crc) & 0xFF];
+ * crc = val ^ (crc >> 8);
+ *
+ */
+void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
+ eor(val, val, crc);
+ andr(val, val, 0xff);
+ ldrw(val, Address(table, val, Address::lsl(2)));
+ eor(crc, val, crc, Assembler::LSR, 8);
+}
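+
+// Roughly equivalent C for the byte step above (illustrative; 'crc_table' is
+// the 256-entry table addressed by 'table'):
+//
+//   static uint32_t update_byte_crc32_c(uint32_t crc, uint8_t val,
+//                                       const uint32_t *crc_table) {
+//     return crc_table[(val ^ crc) & 0xff] ^ (crc >> 8);
+//   }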
+
+/**
+ * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
+ *
+ * @param [in,out]crc Register containing the crc.
+ * @param [in]v Register containing the 32-bit to fold into the CRC.
+ * @param [in]table0 Register containing table 0 of crc constants.
+ * @param [in]table1 Register containing table 1 of crc constants.
+ * @param [in]table2 Register containing table 2 of crc constants.
+ * @param [in]table3 Register containing table 3 of crc constants.
+ *
+ * uint32_t crc;
+ * v = crc ^ v
+ * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
+ *
+ */
+void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
+ Register table0, Register table1, Register table2, Register table3,
+ bool upper) {
+ eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
+ uxtb(tmp, v);
+ ldrw(crc, Address(table3, tmp, Address::lsl(2)));
+ ubfx(tmp, v, 8, 8);
+ ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
+ eor(crc, crc, tmp);
+ ubfx(tmp, v, 16, 8);
+ ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
+ eor(crc, crc, tmp);
+ ubfx(tmp, v, 24, 8);
+ ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
+ eor(crc, crc, tmp);
+}
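+
+// Illustrative C equivalent of the word step above (a "slicing-by-4" fold;
+// t0..t3 stand for table0..table3 and all names are illustrative):
+//
+//   static uint32_t update_word_crc32_c(uint32_t crc, uint64_t v, bool upper,
+//                                       const uint32_t *t0, const uint32_t *t1,
+//                                       const uint32_t *t2, const uint32_t *t3) {
+//     uint32_t w = (uint32_t)(upper ? (v >> 32) : v) ^ crc;
+//     return t3[w & 0xff] ^ t2[(w >> 8) & 0xff]
+//          ^ t1[(w >> 16) & 0xff] ^ t0[(w >> 24) & 0xff];
+//   }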
+
+/**
+ * @param crc register containing existing CRC (32-bit)
+ * @param buf register pointing to input byte buffer (byte*)
+ * @param len register containing number of bytes
+ * @param table0..table3 registers that will contain the addresses of the CRC tables
+ * @param tmp, tmp2, tmp3 scratch registers
+ */
+void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
+ Register table0, Register table1, Register table2, Register table3,
+ Register tmp, Register tmp2, Register tmp3) {
+ Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
+ unsigned long offset;
+
+ ornw(crc, zr, crc);
+
+ if (UseCRC32) {
+ Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;
+
+ subs(len, len, 64);
+ br(Assembler::GE, CRC_by64_loop);
+ adds(len, len, 64-4);
+ br(Assembler::GE, CRC_by4_loop);
+ adds(len, len, 4);
+ br(Assembler::GT, CRC_by1_loop);
+ b(L_exit);
+
+ BIND(CRC_by4_loop);
+ ldrw(tmp, Address(post(buf, 4)));
+ subs(len, len, 4);
+ crc32w(crc, crc, tmp);
+ br(Assembler::GE, CRC_by4_loop);
+ adds(len, len, 4);
+ br(Assembler::LE, L_exit);
+ BIND(CRC_by1_loop);
+ ldrb(tmp, Address(post(buf, 1)));
+ subs(len, len, 1);
+ crc32b(crc, crc, tmp);
+ br(Assembler::GT, CRC_by1_loop);
+ b(L_exit);
+
+ align(CodeEntryAlignment);
+ BIND(CRC_by64_loop);
+ subs(len, len, 64);
+ ldp(tmp, tmp3, Address(post(buf, 16)));
+ crc32x(crc, crc, tmp);
+ crc32x(crc, crc, tmp3);
+ ldp(tmp, tmp3, Address(post(buf, 16)));
+ crc32x(crc, crc, tmp);
+ crc32x(crc, crc, tmp3);
+ ldp(tmp, tmp3, Address(post(buf, 16)));
+ crc32x(crc, crc, tmp);
+ crc32x(crc, crc, tmp3);
+ ldp(tmp, tmp3, Address(post(buf, 16)));
+ crc32x(crc, crc, tmp);
+ crc32x(crc, crc, tmp3);
+ br(Assembler::GE, CRC_by64_loop);
+ adds(len, len, 64-4);
+ br(Assembler::GE, CRC_by4_loop);
+ adds(len, len, 4);
+ br(Assembler::GT, CRC_by1_loop);
+ BIND(L_exit);
+ ornw(crc, zr, crc);
+ return;
+ }
+
+ adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
+ if (offset) add(table0, table0, offset);
+ add(table1, table0, 1*256*sizeof(juint));
+ add(table2, table0, 2*256*sizeof(juint));
+ add(table3, table0, 3*256*sizeof(juint));
+
+ if (UseNeon) {
+ cmp(len, 64);
+ br(Assembler::LT, L_by16);
+ eor(v16, T16B, v16, v16);
+
+ Label L_fold;
+
+ add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
+
+ ld1(v0, v1, T2D, post(buf, 32));
+ ld1r(v4, T2D, post(tmp, 8));
+ ld1r(v5, T2D, post(tmp, 8));
+ ld1r(v6, T2D, post(tmp, 8));
+ ld1r(v7, T2D, post(tmp, 8));
+ mov(v16, T4S, 0, crc);
+
+ eor(v0, T16B, v0, v16);
+ sub(len, len, 64);
+
+ BIND(L_fold);
+ pmull(v22, T8H, v0, v5, T8B);
+ pmull(v20, T8H, v0, v7, T8B);
+ pmull(v23, T8H, v0, v4, T8B);
+ pmull(v21, T8H, v0, v6, T8B);
+
+ pmull2(v18, T8H, v0, v5, T16B);
+ pmull2(v16, T8H, v0, v7, T16B);
+ pmull2(v19, T8H, v0, v4, T16B);
+ pmull2(v17, T8H, v0, v6, T16B);
+
+ uzp1(v24, v20, v22, T8H);
+ uzp2(v25, v20, v22, T8H);
+ eor(v20, T16B, v24, v25);
+
+ uzp1(v26, v16, v18, T8H);
+ uzp2(v27, v16, v18, T8H);
+ eor(v16, T16B, v26, v27);
+
+ ushll2(v22, T4S, v20, T8H, 8);
+ ushll(v20, T4S, v20, T4H, 8);
+
+ ushll2(v18, T4S, v16, T8H, 8);
+ ushll(v16, T4S, v16, T4H, 8);
+
+ eor(v22, T16B, v23, v22);
+ eor(v18, T16B, v19, v18);
+ eor(v20, T16B, v21, v20);
+ eor(v16, T16B, v17, v16);
+
+ uzp1(v17, v16, v20, T2D);
+ uzp2(v21, v16, v20, T2D);
+ eor(v17, T16B, v17, v21);
+
+ ushll2(v20, T2D, v17, T4S, 16);
+ ushll(v16, T2D, v17, T2S, 16);
+
+ eor(v20, T16B, v20, v22);
+ eor(v16, T16B, v16, v18);
+
+ uzp1(v17, v20, v16, T2D);
+ uzp2(v21, v20, v16, T2D);
+ eor(v28, T16B, v17, v21);
+
+ pmull(v22, T8H, v1, v5, T8B);
+ pmull(v20, T8H, v1, v7, T8B);
+ pmull(v23, T8H, v1, v4, T8B);
+ pmull(v21, T8H, v1, v6, T8B);
+
+ pmull2(v18, T8H, v1, v5, T16B);
+ pmull2(v16, T8H, v1, v7, T16B);
+ pmull2(v19, T8H, v1, v4, T16B);
+ pmull2(v17, T8H, v1, v6, T16B);
+
+ ld1(v0, v1, T2D, post(buf, 32));
+
+ uzp1(v24, v20, v22, T8H);
+ uzp2(v25, v20, v22, T8H);
+ eor(v20, T16B, v24, v25);
+
+ uzp1(v26, v16, v18, T8H);
+ uzp2(v27, v16, v18, T8H);
+ eor(v16, T16B, v26, v27);
+
+ ushll2(v22, T4S, v20, T8H, 8);
+ ushll(v20, T4S, v20, T4H, 8);
+
+ ushll2(v18, T4S, v16, T8H, 8);
+ ushll(v16, T4S, v16, T4H, 8);
+
+ eor(v22, T16B, v23, v22);
+ eor(v18, T16B, v19, v18);
+ eor(v20, T16B, v21, v20);
+ eor(v16, T16B, v17, v16);
+
+ uzp1(v17, v16, v20, T2D);
+ uzp2(v21, v16, v20, T2D);
+ eor(v16, T16B, v17, v21);
+
+ ushll2(v20, T2D, v16, T4S, 16);
+ ushll(v16, T2D, v16, T2S, 16);
+
+ eor(v20, T16B, v22, v20);
+ eor(v16, T16B, v16, v18);
+
+ uzp1(v17, v20, v16, T2D);
+ uzp2(v21, v20, v16, T2D);
+ eor(v20, T16B, v17, v21);
+
+ shl(v16, v28, T2D, 1);
+ shl(v17, v20, T2D, 1);
+
+ eor(v0, T16B, v0, v16);
+ eor(v1, T16B, v1, v17);
+
+ subs(len, len, 32);
+ br(Assembler::GE, L_fold);
+
+ mov(crc, 0);
+ mov(tmp, v0, T1D, 0);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+ mov(tmp, v0, T1D, 1);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+ mov(tmp, v1, T1D, 0);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+ mov(tmp, v1, T1D, 1);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+
+ add(len, len, 32);
+ }
+
+ BIND(L_by16);
+ subs(len, len, 16);
+ br(Assembler::GE, L_by16_loop);
+ adds(len, len, 16-4);
+ br(Assembler::GE, L_by4_loop);
+ adds(len, len, 4);
+ br(Assembler::GT, L_by1_loop);
+ b(L_exit);
+
+ BIND(L_by4_loop);
+ ldrw(tmp, Address(post(buf, 4)));
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
+ subs(len, len, 4);
+ br(Assembler::GE, L_by4_loop);
+ adds(len, len, 4);
+ br(Assembler::LE, L_exit);
+ BIND(L_by1_loop);
+ subs(len, len, 1);
+ ldrb(tmp, Address(post(buf, 1)));
+ update_byte_crc32(crc, tmp, table0);
+ br(Assembler::GT, L_by1_loop);
+ b(L_exit);
+
+ align(CodeEntryAlignment);
+ BIND(L_by16_loop);
+ subs(len, len, 16);
+ ldp(tmp, tmp3, Address(post(buf, 16)));
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
+ update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
+ update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
+ update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
+ br(Assembler::GE, L_by16_loop);
+ adds(len, len, 16-4);
+ br(Assembler::GE, L_by4_loop);
+ adds(len, len, 4);
+ br(Assembler::GT, L_by1_loop);
+ BIND(L_exit);
+ ornw(crc, zr, crc);
+}
+
+SkipIfEqual::SkipIfEqual(
+ MacroAssembler* masm, const bool* flag_addr, bool value) {
+ _masm = masm;
+ unsigned long offset;
+ _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
+ _masm->ldrb(rscratch1, Address(rscratch1, offset));
+ _masm->cbzw(rscratch1, _label);
+}
+
+SkipIfEqual::~SkipIfEqual() {
+ _masm->bind(_label);
+}
+
+void MacroAssembler::cmpptr(Register src1, Address src2) {
+ unsigned long offset;
+ adrp(rscratch1, src2, offset);
+ ldr(rscratch1, Address(rscratch1, offset));
+ cmp(src1, rscratch1);
+}
+
+void MacroAssembler::store_check(Register obj) {
+ // Does a store check for the oop in register obj. The content of
+ // register obj is destroyed afterwards.
+ store_check_part_1(obj);
+ store_check_part_2(obj);
+}
+
+void MacroAssembler::store_check(Register obj, Address dst) {
+ store_check(obj);
+}
+
+
+// split the store check operation so that other instructions can be scheduled inbetween
+void MacroAssembler::store_check_part_1(Register obj) {
+ BarrierSet* bs = Universe::heap()->barrier_set();
+ assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
+ lsr(obj, obj, CardTableModRefBS::card_shift);
+}
+
+void MacroAssembler::store_check_part_2(Register obj) {
+ BarrierSet* bs = Universe::heap()->barrier_set();
+ assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
+ CardTableModRefBS* ct = (CardTableModRefBS*)bs;
+ assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+
+ // The calculation for byte_map_base is as follows:
+ // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
+ // So this essentially converts an address to a displacement and
+ // it will never need to be relocated.
+
+ // FIXME: It's not likely that disp will fit into an offset so we
+ // don't bother to check, but it could save an instruction.
+ intptr_t disp = (intptr_t) ct->byte_map_base;
+ mov(rscratch1, disp);
+ strb(zr, Address(obj, rscratch1));
+}
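+
+// Taken together, the two parts above amount to the usual card mark, in C
+// terms (illustrative only; byte_map_base and card_shift are the card table
+// fields referenced above):
+//
+//   volatile jbyte *card = ct->byte_map_base + ((uintptr_t)obj >> card_shift);
+//   *card = 0;   // CardTableModRefBS::dirty_card_val()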
+
+void MacroAssembler::load_klass(Register dst, Register src) {
+ if (UseCompressedClassPointers) {
+ ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
+ decode_klass_not_null(dst);
+ } else {
+ ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
+ }
+}
+
+void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
+ if (UseCompressedClassPointers) {
+ ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
+ if (Universe::narrow_klass_base() == NULL) {
+ cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
+ return;
+ } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
+ && Universe::narrow_klass_shift() == 0) {
+ // Only the bottom 32 bits matter
+ cmpw(trial_klass, tmp);
+ return;
+ }
+ decode_klass_not_null(tmp);
+ } else {
+ ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
+ }
+ cmp(trial_klass, tmp);
+}
+
+void MacroAssembler::load_prototype_header(Register dst, Register src) {
+ load_klass(dst, src);
+ ldr(dst, Address(dst, Klass::prototype_header_offset()));
+}
+
+void MacroAssembler::store_klass(Register dst, Register src) {
+  // FIXME: Should this be a store release? Concurrent GCs assume the
+  // klass length is valid if the klass field is not null.
+ if (UseCompressedClassPointers) {
+ encode_klass_not_null(src);
+ strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
+ } else {
+ str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
+ }
+}
+
+void MacroAssembler::store_klass_gap(Register dst, Register src) {
+ if (UseCompressedClassPointers) {
+ // Store to klass gap in destination
+ strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
+ }
+}
+
+// Algorithm must match oop.inline.hpp encode_heap_oop.
+void MacroAssembler::encode_heap_oop(Register d, Register s) {
+#ifdef ASSERT
+ verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
+#endif
+ verify_oop(s, "broken oop in encode_heap_oop");
+ if (Universe::narrow_oop_base() == NULL) {
+ if (Universe::narrow_oop_shift() != 0) {
+ assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
+ lsr(d, s, LogMinObjAlignmentInBytes);
+ } else {
+ mov(d, s);
+ }
+ } else {
+ subs(d, s, rheapbase);
+ csel(d, d, zr, Assembler::HS);
+ lsr(d, d, LogMinObjAlignmentInBytes);
+
+ /* Old algorithm: is this any worse?
+ Label nonnull;
+ cbnz(r, nonnull);
+ sub(r, r, rheapbase);
+ bind(nonnull);
+ lsr(r, r, LogMinObjAlignmentInBytes);
+ */
+ }
+}
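+
+// Illustrative C view of the branchless sub/csel/lsr sequence above (the
+// names are the Universe fields referenced here):
+//
+//   if (narrow_oop_base == NULL) {
+//     narrow = (uint32_t)((uintptr_t)s >> narrow_oop_shift);
+//   } else {
+//     uintptr_t diff = (uintptr_t)s - (uintptr_t)narrow_oop_base;
+//     if ((uintptr_t)s < (uintptr_t)narrow_oop_base) diff = 0;  // NULL encodes to 0
+//     narrow = (uint32_t)(diff >> narrow_oop_shift);
+//   }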
+
+void MacroAssembler::encode_heap_oop_not_null(Register r) {
+#ifdef ASSERT
+ verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
+ if (CheckCompressedOops) {
+ Label ok;
+ cbnz(r, ok);
+ stop("null oop passed to encode_heap_oop_not_null");
+ bind(ok);
+ }
+#endif
+ verify_oop(r, "broken oop in encode_heap_oop_not_null");
+ if (Universe::narrow_oop_base() != NULL) {
+ sub(r, r, rheapbase);
+ }
+ if (Universe::narrow_oop_shift() != 0) {
+ assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
+ lsr(r, r, LogMinObjAlignmentInBytes);
+ }
+}
+
+void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
+#ifdef ASSERT
+ verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
+ if (CheckCompressedOops) {
+ Label ok;
+ cbnz(src, ok);
+ stop("null oop passed to encode_heap_oop_not_null2");
+ bind(ok);
+ }
+#endif
+ verify_oop(src, "broken oop in encode_heap_oop_not_null2");
+
+ Register data = src;
+ if (Universe::narrow_oop_base() != NULL) {
+ sub(dst, src, rheapbase);
+ data = dst;
+ }
+ if (Universe::narrow_oop_shift() != 0) {
+ assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
+ lsr(dst, data, LogMinObjAlignmentInBytes);
+ data = dst;
+ }
+ if (data == src)
+ mov(dst, src);
+}
+
+void MacroAssembler::decode_heap_oop(Register d, Register s) {
+#ifdef ASSERT
+ verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
+#endif
+ if (Universe::narrow_oop_base() == NULL) {
+ if (Universe::narrow_oop_shift() != 0 || d != s) {
+ lsl(d, s, Universe::narrow_oop_shift());
+ }
+ } else {
+ Label done;
+ if (d != s)
+ mov(d, s);
+ cbz(s, done);
+ add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
+ bind(done);
+ }
+ verify_oop(d, "broken oop in decode_heap_oop");
+}
+
+void MacroAssembler::decode_heap_oop_not_null(Register r) {
+ assert (UseCompressedOops, "should only be used for compressed headers");
+ assert (Universe::heap() != NULL, "java heap should be initialized");
+ // Cannot assert, unverified entry point counts instructions (see .ad file)
+ // vtableStubs also counts instructions in pd_code_size_limit.
+ // Also do not verify_oop as this is called by verify_oop.
+ if (Universe::narrow_oop_shift() != 0) {
+ assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
+ if (Universe::narrow_oop_base() != NULL) {
+ add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
+ } else {
+ add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
+ }
+ } else {
+ assert (Universe::narrow_oop_base() == NULL, "sanity");
+ }
+}
+
+void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
+ assert (UseCompressedOops, "should only be used for compressed headers");
+ assert (Universe::heap() != NULL, "java heap should be initialized");
+ // Cannot assert, unverified entry point counts instructions (see .ad file)
+ // vtableStubs also counts instructions in pd_code_size_limit.
+ // Also do not verify_oop as this is called by verify_oop.
+ if (Universe::narrow_oop_shift() != 0) {
+ assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
+ if (Universe::narrow_oop_base() != NULL) {
+ add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
+ } else {
+ add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
+ }
+ } else {
+ assert (Universe::narrow_oop_base() == NULL, "sanity");
+ if (dst != src) {
+ mov(dst, src);
+ }
+ }
+}
+
+void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
+ if (Universe::narrow_klass_base() == NULL) {
+ if (Universe::narrow_klass_shift() != 0) {
+ assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
+ lsr(dst, src, LogKlassAlignmentInBytes);
+ } else {
+ if (dst != src) mov(dst, src);
+ }
+ return;
+ }
+
+ if (use_XOR_for_compressed_class_base) {
+ if (Universe::narrow_klass_shift() != 0) {
+ eor(dst, src, (uint64_t)Universe::narrow_klass_base());
+ lsr(dst, dst, LogKlassAlignmentInBytes);
+ } else {
+ eor(dst, src, (uint64_t)Universe::narrow_klass_base());
+ }
+ return;
+ }
+
+ if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
+ && Universe::narrow_klass_shift() == 0) {
+ movw(dst, src);
+ return;
+ }
+
+#ifdef ASSERT
+ verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
+#endif
+
+ Register rbase = dst;
+ if (dst == src) rbase = rheapbase;
+ mov(rbase, (uint64_t)Universe::narrow_klass_base());
+ sub(dst, src, rbase);
+ if (Universe::narrow_klass_shift() != 0) {
+ assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
+ lsr(dst, dst, LogKlassAlignmentInBytes);
+ }
+ if (dst == src) reinit_heapbase();
+}
+
+void MacroAssembler::encode_klass_not_null(Register r) {
+ encode_klass_not_null(r, r);
+}
+
+void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
+ Register rbase = dst;
+ assert (UseCompressedClassPointers, "should only be used for compressed headers");
+
+ if (Universe::narrow_klass_base() == NULL) {
+ if (Universe::narrow_klass_shift() != 0) {
+ assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
+ lsl(dst, src, LogKlassAlignmentInBytes);
+ } else {
+ if (dst != src) mov(dst, src);
+ }
+ return;
+ }
+
+ if (use_XOR_for_compressed_class_base) {
+ if (Universe::narrow_klass_shift() != 0) {
+ lsl(dst, src, LogKlassAlignmentInBytes);
+ eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
+ } else {
+ eor(dst, src, (uint64_t)Universe::narrow_klass_base());
+ }
+ return;
+ }
+
+ if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
+ && Universe::narrow_klass_shift() == 0) {
+ if (dst != src)
+ movw(dst, src);
+ movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
+ return;
+ }
+
+ // Cannot assert, unverified entry point counts instructions (see .ad file)
+ // vtableStubs also counts instructions in pd_code_size_limit.
+ // Also do not verify_oop as this is called by verify_oop.
+ if (dst == src) rbase = rheapbase;
+ mov(rbase, (uint64_t)Universe::narrow_klass_base());
+ if (Universe::narrow_klass_shift() != 0) {
+ assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
+ add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
+ } else {
+ add(dst, rbase, src);
+ }
+ if (dst == src) reinit_heapbase();
+}
+
+void MacroAssembler::decode_klass_not_null(Register r) {
+ decode_klass_not_null(r, r);
+}
+
+void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
+ assert (UseCompressedOops, "should only be used for compressed oops");
+ assert (Universe::heap() != NULL, "java heap should be initialized");
+ assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
+
+ int oop_index = oop_recorder()->find_index(obj);
+ assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
+
+ InstructionMark im(this);
+ RelocationHolder rspec = oop_Relocation::spec(oop_index);
+ code_section()->relocate(inst_mark(), rspec);
+ movz(dst, 0xDEAD, 16);
+ movk(dst, 0xBEEF);
+}
+
+void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
+ assert (UseCompressedClassPointers, "should only be used for compressed headers");
+ assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
+ int index = oop_recorder()->find_index(k);
+ assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
+
+ InstructionMark im(this);
+ RelocationHolder rspec = metadata_Relocation::spec(index);
+ code_section()->relocate(inst_mark(), rspec);
+ narrowKlass nk = Klass::encode_klass(k);
+ movz(dst, (nk >> 16), 16);
+ movk(dst, nk & 0xffff);
+}
+
+void MacroAssembler::load_heap_oop(Register dst, Address src)
+{
+ if (UseCompressedOops) {
+ ldrw(dst, src);
+ decode_heap_oop(dst);
+ } else {
+ ldr(dst, src);
+ }
+}
+
+void MacroAssembler::load_heap_oop_not_null(Register dst, Address src)
+{
+ if (UseCompressedOops) {
+ ldrw(dst, src);
+ decode_heap_oop_not_null(dst);
+ } else {
+ ldr(dst, src);
+ }
+}
+
+void MacroAssembler::store_heap_oop(Address dst, Register src) {
+ if (UseCompressedOops) {
+ assert(!dst.uses(src), "not enough registers");
+ encode_heap_oop(src);
+ strw(src, dst);
+ } else
+ str(src, dst);
+}
+
+// Used for storing NULLs.
+void MacroAssembler::store_heap_oop_null(Address dst) {
+ if (UseCompressedOops) {
+ strw(zr, dst);
+ } else
+ str(zr, dst);
+}
+
+#if INCLUDE_ALL_GCS
+void MacroAssembler::g1_write_barrier_pre(Register obj,
+ Register pre_val,
+ Register thread,
+ Register tmp,
+ bool tosca_live,
+ bool expand_call) {
+ // If expand_call is true then we expand the call_VM_leaf macro
+ // directly to skip generating the check by
+ // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
+
+ assert(thread == rthread, "must be");
+
+ Label done;
+ Label runtime;
+
+ assert(pre_val != noreg, "check this code");
+
+ if (obj != noreg)
+ assert_different_registers(obj, pre_val, tmp);
+
+ Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+ PtrQueue::byte_offset_of_active()));
+ Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+ PtrQueue::byte_offset_of_index()));
+ Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
+ PtrQueue::byte_offset_of_buf()));
+
+
+ // Is marking active?
+ if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
+ ldrw(tmp, in_progress);
+ } else {
+ assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
+ ldrb(tmp, in_progress);
+ }
+ cbzw(tmp, done);
+
+ // Do we need to load the previous value?
+ if (obj != noreg) {
+ load_heap_oop(pre_val, Address(obj, 0));
+ }
+
+ // Is the previous value null?
+ cbz(pre_val, done);
+
+ // Can we store original value in the thread's buffer?
+ // Is index == 0?
+ // (The index field is typed as size_t.)
+
+ ldr(tmp, index); // tmp := *index_adr
+ cbz(tmp, runtime); // tmp == 0?
+ // If yes, goto runtime
+
+ sub(tmp, tmp, wordSize); // tmp := tmp - wordSize
+ str(tmp, index); // *index_adr := tmp
+ ldr(rscratch1, buffer);
+ add(tmp, tmp, rscratch1); // tmp := tmp + *buffer_adr
+
+ // Record the previous value
+ str(pre_val, Address(tmp, 0));
+ b(done);
+
+ bind(runtime);
+ // save the live input values
+ push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
+
+ // Calling the runtime using the regular call_VM_leaf mechanism generates
+  // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
+ // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL.
+ //
+  // If we are generating the pre-barrier without a frame (e.g. in the
+  // intrinsified Reference.get() routine) then rfp might be pointing to
+ // the caller frame and so this check will most likely fail at runtime.
+ //
+ // Expanding the call directly bypasses the generation of the check.
+  // So when we do not have a full interpreter frame on the stack
+ // expand_call should be passed true.
+
+ if (expand_call) {
+ assert(pre_val != c_rarg1, "smashed arg");
+ pass_arg1(this, thread);
+ pass_arg0(this, pre_val);
+ MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
+ } else {
+ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
+ }
+
+ pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
+
+ bind(done);
+}
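+
+// Sketch of the pre-barrier fast path generated above, in C-like form
+// (illustrative; the satb_* names stand for the SATB mark queue fields
+// addressed via the PtrQueue offsets above):
+//
+//   if (!thread->satb_active) return;          // marking not active
+//   if (obj != NULL) pre_val = *obj;           // load the previous value
+//   if (pre_val == NULL) return;
+//   if (thread->satb_index == 0) {
+//     g1_wb_pre(pre_val, thread);              // slow path: enqueue via runtime
+//   } else {
+//     thread->satb_index -= wordSize;          // index counts bytes, downwards
+//     *(oop*)(thread->satb_buf + thread->satb_index) = pre_val;
+//   }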
+
+void MacroAssembler::g1_write_barrier_post(Register store_addr,
+ Register new_val,
+ Register thread,
+ Register tmp,
+ Register tmp2) {
+ assert(thread == rthread, "must be");
+
+ Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
+ PtrQueue::byte_offset_of_index()));
+ Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
+ PtrQueue::byte_offset_of_buf()));
+
+ BarrierSet* bs = Universe::heap()->barrier_set();
+ CardTableModRefBS* ct = (CardTableModRefBS*)bs;
+ assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+
+ Label done;
+ Label runtime;
+
+ // Does store cross heap regions?
+
+ eor(tmp, store_addr, new_val);
+ lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
+ cbz(tmp, done);
+
+ // crosses regions, storing NULL?
+
+ cbz(new_val, done);
+
+ // storing region crossing non-NULL, is card already dirty?
+
+ ExternalAddress cardtable((address) ct->byte_map_base);
+ assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+ const Register card_addr = tmp;
+
+ lsr(card_addr, store_addr, CardTableModRefBS::card_shift);
+
+ unsigned long offset;
+ adrp(tmp2, cardtable, offset);
+
+ // get the address of the card
+ add(card_addr, card_addr, tmp2);
+ ldrb(tmp2, Address(card_addr, offset));
+ cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
+ br(Assembler::EQ, done);
+
+ assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0");
+
+ membar(Assembler::StoreLoad);
+
+ ldrb(tmp2, Address(card_addr, offset));
+ cbzw(tmp2, done);
+
+ // storing a region crossing, non-NULL oop, card is clean.
+ // dirty card and log.
+
+ strb(zr, Address(card_addr, offset));
+
+ ldr(rscratch1, queue_index);
+ cbz(rscratch1, runtime);
+ sub(rscratch1, rscratch1, wordSize);
+ str(rscratch1, queue_index);
+
+ ldr(tmp2, buffer);
+ str(card_addr, Address(tmp2, rscratch1));
+ b(done);
+
+ bind(runtime);
+ // save the live input values
+ push(store_addr->bit(true) | new_val->bit(true), sp);
+ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
+ pop(store_addr->bit(true) | new_val->bit(true), sp);
+
+ bind(done);
+}
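+
+// Sketch of the post-barrier above (illustrative C; card table and dirty
+// card queue names follow the fields used above):
+//
+//   if ((((uintptr_t)store_addr ^ (uintptr_t)new_val) >> LogOfHRGrainBytes) == 0)
+//     return;                                  // same region, no barrier needed
+//   if (new_val == NULL) return;
+//   volatile jbyte *card = ct->byte_map_base + ((uintptr_t)store_addr >> card_shift);
+//   if (*card == g1_young_card_val()) return;
+//   StoreLoad_membar();
+//   if (*card == dirty_card_val()) return;     // already dirty
+//   *card = dirty_card_val();                  // dirty the card, then log it
+//   if (thread->dcq_index == 0) {
+//     g1_wb_post(card, thread);                // slow path
+//   } else {
+//     thread->dcq_index -= wordSize;
+//     *(jbyte**)(thread->dcq_buf + thread->dcq_index) = (jbyte*)card;
+//   }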
+
+#endif // INCLUDE_ALL_GCS
+
+Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
+ assert(oop_recorder() != NULL, "this assembler needs a Recorder");
+ int index = oop_recorder()->allocate_metadata_index(obj);
+ RelocationHolder rspec = metadata_Relocation::spec(index);
+ return Address((address)obj, rspec);
+}
+
+// Move an oop into a register. immediate is true if we want
+// immediate instructions, i.e. we are not going to patch this
+// instruction while the code is being executed by another thread. In
+// that case we can use move immediates rather than the constant pool.
+void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
+ int oop_index;
+ if (obj == NULL) {
+ oop_index = oop_recorder()->allocate_oop_index(obj);
+ } else {
+ oop_index = oop_recorder()->find_index(obj);
+ assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
+ }
+ RelocationHolder rspec = oop_Relocation::spec(oop_index);
+ if (! immediate) {
+ address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
+ ldr_constant(dst, Address(dummy, rspec));
+ } else
+ mov(dst, Address((address)obj, rspec));
+}
+
+// Move a metadata address into a register.
+void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
+ int oop_index;
+ if (obj == NULL) {
+ oop_index = oop_recorder()->allocate_metadata_index(obj);
+ } else {
+ oop_index = oop_recorder()->find_index(obj);
+ }
+ RelocationHolder rspec = metadata_Relocation::spec(oop_index);
+ mov(dst, Address((address)obj, rspec));
+}
+
+Address MacroAssembler::constant_oop_address(jobject obj) {
+ assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
+ assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
+ int oop_index = oop_recorder()->find_index(obj);
+ return Address((address)obj, oop_Relocation::spec(oop_index));
+}
+
+// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
+void MacroAssembler::tlab_allocate(Register obj,
+ Register var_size_in_bytes,
+ int con_size_in_bytes,
+ Register t1,
+ Register t2,
+ Label& slow_case) {
+ assert_different_registers(obj, t2);
+ assert_different_registers(obj, var_size_in_bytes);
+ Register end = t2;
+
+ // verify_tlab();
+
+ ldr(obj, Address(rthread, JavaThread::tlab_top_offset()));
+ if (var_size_in_bytes == noreg) {
+ lea(end, Address(obj, con_size_in_bytes));
+ } else {
+ lea(end, Address(obj, var_size_in_bytes));
+ }
+ ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset()));
+ cmp(end, rscratch1);
+ br(Assembler::HI, slow_case);
+
+ // update the tlab top pointer
+ str(end, Address(rthread, JavaThread::tlab_top_offset()));
+
+ // recover var_size_in_bytes if necessary
+ if (var_size_in_bytes == end) {
+ sub(var_size_in_bytes, var_size_in_bytes, obj);
+ }
+ // verify_tlab();
+}
+
+// Preserves r19, and r3.
+Register MacroAssembler::tlab_refill(Label& retry,
+ Label& try_eden,
+ Label& slow_case) {
+ Register top = r0;
+ Register t1 = r2;
+ Register t2 = r4;
+ assert_different_registers(top, rthread, t1, t2, /* preserve: */ r19, r3);
+ Label do_refill, discard_tlab;
+
+ if (!Universe::heap()->supports_inline_contig_alloc()) {
+ // No allocation in the shared eden.
+ b(slow_case);
+ }
+
+ ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
+ ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
+
+ // calculate amount of free space
+ sub(t1, t1, top);
+ lsr(t1, t1, LogHeapWordSize);
+
+ // Retain tlab and allocate object in shared space if
+ // the amount free in the tlab is too large to discard.
+
+ ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
+ cmp(t1, rscratch1);
+ br(Assembler::LE, discard_tlab);
+
+ // Retain
+ // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
+ mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
+ add(rscratch1, rscratch1, t2);
+ str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
+
+ if (TLABStats) {
+ // increment number of slow_allocations
+ addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())),
+ 1, rscratch1);
+ }
+ b(try_eden);
+
+ bind(discard_tlab);
+ if (TLABStats) {
+ // increment number of refills
+ addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1,
+ rscratch1);
+ // accumulate wastage -- t1 is amount free in tlab
+ addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1,
+ rscratch1);
+ }
+
+ // if tlab is currently allocated (top or end != null) then
+ // fill [top, end + alignment_reserve) with array object
+ cbz(top, do_refill);
+
+ // set up the mark word
+ mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
+ str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes()));
+ // set the length to the remaining space
+ sub(t1, t1, typeArrayOopDesc::header_size(T_INT));
+ add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
+ lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint)));
+ strw(t1, Address(top, arrayOopDesc::length_offset_in_bytes()));
+ // set klass to intArrayKlass
+ {
+ unsigned long offset;
+    // dubious reloc: why not an oop reloc?
+ adrp(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()),
+ offset);
+ ldr(t1, Address(rscratch1, offset));
+ }
+  // store klass last.  Concurrent GCs assume the klass length is valid if
+  // the klass field is not null.
+ store_klass(top, t1);
+
+ mov(t1, top);
+ ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
+ sub(t1, t1, rscratch1);
+ incr_allocated_bytes(rthread, t1, 0, rscratch1);
+
+ // refill the tlab with an eden allocation
+ bind(do_refill);
+ ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
+ lsl(t1, t1, LogHeapWordSize);
+ // allocate new tlab, address returned in top
+ eden_allocate(top, t1, 0, t2, slow_case);
+
+ // Check that t1 was preserved in eden_allocate.
+#ifdef ASSERT
+ if (UseTLAB) {
+ Label ok;
+ Register tsize = r4;
+ assert_different_registers(tsize, rthread, t1);
+ str(tsize, Address(pre(sp, -16)));
+ ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
+ lsl(tsize, tsize, LogHeapWordSize);
+ cmp(t1, tsize);
+ br(Assembler::EQ, ok);
+ STOP("assert(t1 != tlab size)");
+ should_not_reach_here();
+
+ bind(ok);
+ ldr(tsize, Address(post(sp, 16)));
+ }
+#endif
+ str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
+ str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
+ add(top, top, t1);
+ sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
+ str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
+ verify_tlab();
+ b(retry);
+
+ return rthread; // for use by caller
+}
+
+// Defines obj, preserves var_size_in_bytes
+void MacroAssembler::eden_allocate(Register obj,
+ Register var_size_in_bytes,
+ int con_size_in_bytes,
+ Register t1,
+ Label& slow_case) {
+ assert_different_registers(obj, var_size_in_bytes, t1);
+ if (!Universe::heap()->supports_inline_contig_alloc()) {
+ b(slow_case);
+ } else {
+ Register end = t1;
+ Register heap_end = rscratch2;
+ Label retry;
+ bind(retry);
+ {
+ unsigned long offset;
+ adrp(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()), offset);
+ ldr(heap_end, Address(rscratch1, offset));
+ }
+
+ ExternalAddress heap_top((address) Universe::heap()->top_addr());
+
+ // Get the current top of the heap
+ {
+ unsigned long offset;
+ adrp(rscratch1, heap_top, offset);
+      // Use add() here after ADRP, rather than lea().
+ // lea() does not generate anything if its offset is zero.
+ // However, relocs expect to find either an ADD or a load/store
+ // insn after an ADRP. add() always generates an ADD insn, even
+ // for add(Rn, Rn, 0).
+ add(rscratch1, rscratch1, offset);
+ ldaxr(obj, rscratch1);
+ }
+
+    // Adjust it by the size of our new object
+ if (var_size_in_bytes == noreg) {
+ lea(end, Address(obj, con_size_in_bytes));
+ } else {
+ lea(end, Address(obj, var_size_in_bytes));
+ }
+
+ // if end < obj then we wrapped around high memory
+ cmp(end, obj);
+ br(Assembler::LO, slow_case);
+
+ cmp(end, heap_end);
+ br(Assembler::HI, slow_case);
+
+ // If heap_top hasn't been changed by some other thread, update it.
+ stlxr(rscratch1, end, rscratch1);
+ cbnzw(rscratch1, retry);
+ }
+}
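+
+// The fast path above is a CAS-style bump-the-pointer allocation; in C-like
+// terms (illustrative only, using the heap top/end words loaded above):
+//
+//   retry:
+//     HeapWord *obj = load_acquire_exclusive(heap_top_addr);
+//     HeapWord *end = obj + size;
+//     if (end < obj || end > heap_end) goto slow_case;  // wrapped or heap full
+//     if (!store_release_exclusive(heap_top_addr, end)) goto retry;
+//     // obj now points at the newly allocated space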
+
+void MacroAssembler::verify_tlab() {
+#ifdef ASSERT
+ if (UseTLAB && VerifyOops) {
+ Label next, ok;
+
+ stp(rscratch2, rscratch1, Address(pre(sp, -16)));
+
+ ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
+ ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
+ cmp(rscratch2, rscratch1);
+ br(Assembler::HS, next);
+ STOP("assert(top >= start)");
+ should_not_reach_here();
+
+ bind(next);
+ ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
+ ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
+ cmp(rscratch2, rscratch1);
+ br(Assembler::HS, ok);
+ STOP("assert(top <= end)");
+ should_not_reach_here();
+
+ bind(ok);
+ ldp(rscratch2, rscratch1, Address(post(sp, 16)));
+ }
+#endif
+}
+
+// Writes to successive stack pages until the given size is reached, to
+// check for stack overflow + shadow pages.  This clobbers tmp.
+void MacroAssembler::bang_stack_size(Register size, Register tmp) {
+ assert_different_registers(tmp, size, rscratch1);
+ mov(tmp, sp);
+ // Bang stack for total size given plus shadow page size.
+ // Bang one page at a time because large size can bang beyond yellow and
+ // red zones.
+ Label loop;
+ mov(rscratch1, os::vm_page_size());
+ bind(loop);
+ lea(tmp, Address(tmp, -os::vm_page_size()));
+ subsw(size, size, rscratch1);
+ str(size, Address(tmp));
+ br(Assembler::GT, loop);
+
+ // Bang down shadow pages too.
+ // At this point, (tmp-0) is the last address touched, so don't
+ // touch it again. (It was touched as (tmp-pagesize) but then tmp
+ // was post-decremented.) Skip this address by starting at i=1, and
+ // touch a few more pages below. N.B. It is important to touch all
+ // the way down to and including i=StackShadowPages.
+  for (int i = 0; i < StackShadowPages-1; i++) {
+    // this could be any sized move but this can be a debugging crumb
+    // so the bigger the better.
+ lea(tmp, Address(tmp, -os::vm_page_size()));
+ str(size, Address(tmp));
+ }
+}
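+
+// In C-like terms the banging above touches one word per page, first for the
+// frame itself and then for the shadow pages (illustrative only):
+//
+//   char *p = sp;
+//   for (int remaining = size; remaining > 0; remaining -= page_size)
+//     *(p -= page_size) = 0;                   // bang each page of the frame
+//   for (int i = 1; i < StackShadowPages; i++)
+//     *(p -= page_size) = 0;                   // then the shadow pages below it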
+
+
+address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
+ unsigned long off;
+ adrp(r, Address(page, rtype), off);
+ InstructionMark im(this);
+ code_section()->relocate(inst_mark(), rtype);
+ ldrw(zr, Address(r, off));
+ return inst_mark();
+}
+
+address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
+ InstructionMark im(this);
+ code_section()->relocate(inst_mark(), rtype);
+ ldrw(zr, Address(r, 0));
+ return inst_mark();
+}
+
+void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
+ relocInfo::relocType rtype = dest.rspec().reloc()->type();
+ if (uabs(pc() - dest.target()) >= (1LL << 32)) {
+ guarantee(rtype == relocInfo::none
+ || rtype == relocInfo::external_word_type
+ || rtype == relocInfo::poll_type
+ || rtype == relocInfo::poll_return_type,
+ "can only use a fixed address with an ADRP");
+ // Out of range. This doesn't happen very often, but we have to
+    // handle it.
+ mov(reg1, dest);
+ byte_offset = 0;
+ } else {
+ InstructionMark im(this);
+ code_section()->relocate(inst_mark(), dest.rspec());
+ byte_offset = (uint64_t)dest.target() & 0xfff;
+ _adrp(reg1, dest.target());
+ }
+}
+
+bool MacroAssembler::use_acq_rel_for_volatile_fields() {
+#ifdef PRODUCT
+ return false;
+#else
+ return UseAcqRelForVolatileFields;
+#endif
+}
+
+void MacroAssembler::build_frame(int framesize) {
+ if (framesize == 0) {
+ // Is this even possible?
+ stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
+ } else if (framesize < ((1 << 9) + 2 * wordSize)) {
+ sub(sp, sp, framesize);
+ stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
+ } else {
+ stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
+ if (framesize < ((1 << 12) + 2 * wordSize))
+ sub(sp, sp, framesize - 2 * wordSize);
+ else {
+ mov(rscratch1, framesize - 2 * wordSize);
+ sub(sp, sp, rscratch1);
+ }
+ }
+}
+
+void MacroAssembler::remove_frame(int framesize) {
+ if (framesize == 0) {
+ ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
+ } else if (framesize < ((1 << 9) + 2 * wordSize)) {
+ ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
+ add(sp, sp, framesize);
+ } else {
+ if (framesize < ((1 << 12) + 2 * wordSize))
+ add(sp, sp, framesize - 2 * wordSize);
+ else {
+ mov(rscratch1, framesize - 2 * wordSize);
+ add(sp, sp, rscratch1);
+ }
+ ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
+ }
+}
+
+
+// Search for str1 in str2 and return index or -1
+void MacroAssembler::string_indexof(Register str2, Register str1,
+ Register cnt2, Register cnt1,
+ Register tmp1, Register tmp2,
+ Register tmp3, Register tmp4,
+ int icnt1, Register result) {
+ Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;
+
+ Register ch1 = rscratch1;
+ Register ch2 = rscratch2;
+ Register cnt1tmp = tmp1;
+ Register cnt2tmp = tmp2;
+ Register cnt1_neg = cnt1;
+ Register cnt2_neg = cnt2;
+ Register result_tmp = tmp4;
+
+ // Note, inline_string_indexOf() generates checks:
+ // if (substr.count > string.count) return -1;
+ // if (substr.count == 0) return 0;
+
+// We have two strings, a source string in str2, cnt2 and a pattern string
+// in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
+
+// For larger pattern and source we use a simplified Boyer Moore algorithm.
+// With a small pattern and source we use linear scan.
+
+ if (icnt1 == -1) {
+ cmp(cnt1, 256); // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
+ ccmp(cnt1, 8, 0b0000, LO); // Can't handle skip >= 256 because we use
+ br(LO, LINEARSEARCH); // a byte array.
+ cmp(cnt1, cnt2, LSR, 2); // Source must be 4 * pattern for BM
+ br(HS, LINEARSEARCH);
+ }
+
+// The Boyer Moore algorithm is based on the description here:-
+//
+// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
+//
+// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
+// and the 'Good Suffix' rule.
+//
+// These rules are essentially heuristics for how far we can shift the
+// pattern along the search string.
+//
+// The implementation here uses the 'Bad Character' rule only because of the
+// complexity of initialisation for the 'Good Suffix' rule.
+//
+// This is also known as the Boyer-Moore-Horspool algorithm:-
+//
+// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
+//
+// #define ASIZE 128
+//
+// int bm(unsigned char *x, int m, unsigned char *y, int n) {
+//   int i, j;
+//   unsigned c;
+//   unsigned char bc[ASIZE];
+//
+//   /* Preprocessing */
+//   for (i = 0; i < ASIZE; ++i)
+//     bc[i] = 0;
+//   for (i = 0; i < m - 1; ) {
+//     c = x[i];
+//     ++i;
+//     if (c < ASIZE) bc[c] = i;
+//   }
+//
+//   /* Searching */
+//   j = 0;
+//   while (j <= n - m) {
+//     c = y[j+m-1];
+//     i = m - 1;
+//     if (x[m-1] == c)
+//       for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
+//     if (i < 0) return j;
+//     if (c < ASIZE)
+//       j = j - bc[c] + m;
+//     else
+//       j += 1; // Advance by 1 only if char >= ASIZE
+//   }
+//   return -1;
+// }
+
+ if (icnt1 == -1) {
+ BIND(BM);
+
+ Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
+ Label BMADV, BMMATCH, BMCHECKEND;
+
+ Register cnt1end = tmp2;
+ Register str2end = cnt2;
+ Register skipch = tmp2;
+
+ // Restrict ASIZE to 128 to reduce stack space/initialisation.
+ // The presence of chars >= ASIZE in the target string does not affect
+ // performance, but we must be careful not to initialise them in the stack
+ // array.
+ // The presence of chars >= ASIZE in the source string may adversely affect
+ // performance since we can only advance by one when we encounter one.
+
+ stp(zr, zr, pre(sp, -128));
+ for (int i = 1; i < 8; i++)
+ stp(zr, zr, Address(sp, i*16));
+
+ mov(cnt1tmp, 0);
+ sub(cnt1end, cnt1, 1);
+ BIND(BCLOOP);
+ ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
+ cmp(ch1, 128);
+ add(cnt1tmp, cnt1tmp, 1);
+ br(HS, BCSKIP);
+ strb(cnt1tmp, Address(sp, ch1));
+ BIND(BCSKIP);
+ cmp(cnt1tmp, cnt1end);
+ br(LT, BCLOOP);
+
+ mov(result_tmp, str2);
+
+ sub(cnt2, cnt2, cnt1);
+ add(str2end, str2, cnt2, LSL, 1);
+ BIND(BMLOOPSTR2);
+ sub(cnt1tmp, cnt1, 1);
+ ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
+ ldrh(skipch, Address(str2, cnt1tmp, Address::lsl(1)));
+ cmp(ch1, skipch);
+ br(NE, BMSKIP);
+ subs(cnt1tmp, cnt1tmp, 1);
+ br(LT, BMMATCH);
+ BIND(BMLOOPSTR1);
+ ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
+ ldrh(ch2, Address(str2, cnt1tmp, Address::lsl(1)));
+ cmp(ch1, ch2);
+ br(NE, BMSKIP);
+ subs(cnt1tmp, cnt1tmp, 1);
+ br(GE, BMLOOPSTR1);
+ BIND(BMMATCH);
+ sub(result_tmp, str2, result_tmp);
+ lsr(result, result_tmp, 1);
+ add(sp, sp, 128);
+ b(DONE);
+ BIND(BMADV);
+ add(str2, str2, 2);
+ b(BMCHECKEND);
+ BIND(BMSKIP);
+ cmp(skipch, 128);
+ br(HS, BMADV);
+ ldrb(ch2, Address(sp, skipch));
+ add(str2, str2, cnt1, LSL, 1);
+ sub(str2, str2, ch2, LSL, 1);
+ BIND(BMCHECKEND);
+ cmp(str2, str2end);
+ br(LE, BMLOOPSTR2);
+ add(sp, sp, 128);
+ b(NOMATCH);
+ }
+
+ BIND(LINEARSEARCH);
+ {
+ Label DO1, DO2, DO3;
+
+ Register str2tmp = tmp2;
+ Register first = tmp3;
+
+ if (icnt1 == -1)
+ {
+ Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT, LAST_WORD;
+
+ cmp(cnt1, 4);
+ br(LT, DOSHORT);
+
+ sub(cnt2, cnt2, cnt1);
+ sub(cnt1, cnt1, 4);
+ mov(result_tmp, cnt2);
+
+ lea(str1, Address(str1, cnt1, Address::uxtw(1)));
+ lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+ sub(cnt1_neg, zr, cnt1, LSL, 1);
+ sub(cnt2_neg, zr, cnt2, LSL, 1);
+ ldr(first, Address(str1, cnt1_neg));
+
+ BIND(FIRST_LOOP);
+ ldr(ch2, Address(str2, cnt2_neg));
+ cmp(first, ch2);
+ br(EQ, STR1_LOOP);
+ BIND(STR2_NEXT);
+ adds(cnt2_neg, cnt2_neg, 2);
+ br(LE, FIRST_LOOP);
+ b(NOMATCH);
+
+ BIND(STR1_LOOP);
+ adds(cnt1tmp, cnt1_neg, 8);
+ add(cnt2tmp, cnt2_neg, 8);
+ br(GE, LAST_WORD);
+
+ BIND(STR1_NEXT);
+ ldr(ch1, Address(str1, cnt1tmp));
+ ldr(ch2, Address(str2, cnt2tmp));
+ cmp(ch1, ch2);
+ br(NE, STR2_NEXT);
+ adds(cnt1tmp, cnt1tmp, 8);
+ add(cnt2tmp, cnt2tmp, 8);
+ br(LT, STR1_NEXT);
+
+ BIND(LAST_WORD);
+ ldr(ch1, Address(str1));
+ sub(str2tmp, str2, cnt1_neg); // adjust to corresponding
+ ldr(ch2, Address(str2tmp, cnt2_neg)); // word in str2
+ cmp(ch1, ch2);
+ br(NE, STR2_NEXT);
+ b(MATCH);
+
+ BIND(DOSHORT);
+ cmp(cnt1, 2);
+ br(LT, DO1);
+ br(GT, DO3);
+ }
+
+ if (icnt1 == 4) {
+ Label CH1_LOOP;
+
+ ldr(ch1, str1);
+ sub(cnt2, cnt2, 4);
+ mov(result_tmp, cnt2);
+ lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+ sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+ BIND(CH1_LOOP);
+ ldr(ch2, Address(str2, cnt2_neg));
+ cmp(ch1, ch2);
+ br(EQ, MATCH);
+ adds(cnt2_neg, cnt2_neg, 2);
+ br(LE, CH1_LOOP);
+ b(NOMATCH);
+ }
+
+ if (icnt1 == -1 || icnt1 == 2) {
+ Label CH1_LOOP;
+
+ BIND(DO2);
+ ldrw(ch1, str1);
+ sub(cnt2, cnt2, 2);
+ mov(result_tmp, cnt2);
+ lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+ sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+ BIND(CH1_LOOP);
+ ldrw(ch2, Address(str2, cnt2_neg));
+ cmp(ch1, ch2);
+ br(EQ, MATCH);
+ adds(cnt2_neg, cnt2_neg, 2);
+ br(LE, CH1_LOOP);
+ b(NOMATCH);
+ }
+
+ if (icnt1 == -1 || icnt1 == 3) {
+ Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
+
+ BIND(DO3);
+ ldrw(first, str1);
+ ldrh(ch1, Address(str1, 4));
+
+ sub(cnt2, cnt2, 3);
+ mov(result_tmp, cnt2);
+ lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+ sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+ BIND(FIRST_LOOP);
+ ldrw(ch2, Address(str2, cnt2_neg));
+ cmpw(first, ch2);
+ br(EQ, STR1_LOOP);
+ BIND(STR2_NEXT);
+ adds(cnt2_neg, cnt2_neg, 2);
+ br(LE, FIRST_LOOP);
+ b(NOMATCH);
+
+ BIND(STR1_LOOP);
+ add(cnt2tmp, cnt2_neg, 4);
+ ldrh(ch2, Address(str2, cnt2tmp));
+ cmp(ch1, ch2);
+ br(NE, STR2_NEXT);
+ b(MATCH);
+ }
+
+ if (icnt1 == -1 || icnt1 == 1) {
+ Label CH1_LOOP, HAS_ZERO;
+ Label DO1_SHORT, DO1_LOOP;
+
+ BIND(DO1);
+ ldrh(ch1, str1);
+ cmp(cnt2, 4);
+ br(LT, DO1_SHORT);
+
+ orr(ch1, ch1, ch1, LSL, 16);
+ orr(ch1, ch1, ch1, LSL, 32);
+
+ sub(cnt2, cnt2, 4);
+ mov(result_tmp, cnt2);
+ lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+ sub(cnt2_neg, zr, cnt2, LSL, 1);
+
+ mov(tmp3, 0x0001000100010001);
+ BIND(CH1_LOOP);
+ ldr(ch2, Address(str2, cnt2_neg));
+ eor(ch2, ch1, ch2);
+ sub(tmp1, ch2, tmp3);
+ orr(tmp2, ch2, 0x7fff7fff7fff7fff);
+ bics(tmp1, tmp1, tmp2);
+ br(NE, HAS_ZERO);
+ adds(cnt2_neg, cnt2_neg, 8);
+ br(LT, CH1_LOOP);
+
+ cmp(cnt2_neg, 8);
+ mov(cnt2_neg, 0);
+ br(LT, CH1_LOOP);
+ b(NOMATCH);
+
+ BIND(HAS_ZERO);
+ rev(tmp1, tmp1);
+ clz(tmp1, tmp1);
+ add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
+ b(MATCH);
+
+ BIND(DO1_SHORT);
+ mov(result_tmp, cnt2);
+ lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+ sub(cnt2_neg, zr, cnt2, LSL, 1);
+ BIND(DO1_LOOP);
+ ldrh(ch2, Address(str2, cnt2_neg));
+ cmpw(ch1, ch2);
+ br(EQ, MATCH);
+ adds(cnt2_neg, cnt2_neg, 2);
+ br(LT, DO1_LOOP);
+ }
+ }
+ BIND(NOMATCH);
+ mov(result, -1);
+ b(DONE);
+ BIND(MATCH);
+ add(result, result_tmp, cnt2_neg, ASR, 1);
+ BIND(DONE);
+}
+
+// Compare strings.
+void MacroAssembler::string_compare(Register str1, Register str2,
+ Register cnt1, Register cnt2, Register result,
+ Register tmp1) {
+ Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
+ NEXT_WORD, DIFFERENCE;
+
+ BLOCK_COMMENT("string_compare {");
+
+ // Compute the minimum of the string lengths and save the difference.
+ subsw(tmp1, cnt1, cnt2);
+ cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
+
+ // A very short string
+ cmpw(cnt2, 4);
+ br(Assembler::LT, SHORT_STRING);
+
+ // Check if the strings start at the same location.
+ cmp(str1, str2);
+ br(Assembler::EQ, LENGTH_DIFF);
+
+ // Compare longwords
+ {
+ subw(cnt2, cnt2, 4); // The last longword is a special case
+
+ // Move both string pointers to the last longword of their
+ // strings, negate the remaining count, and convert it to bytes.
+ lea(str1, Address(str1, cnt2, Address::uxtw(1)));
+ lea(str2, Address(str2, cnt2, Address::uxtw(1)));
+ sub(cnt2, zr, cnt2, LSL, 1);
+
+ // Loop, loading longwords and comparing them into rscratch2.
+ bind(NEXT_WORD);
+ ldr(result, Address(str1, cnt2));
+ ldr(cnt1, Address(str2, cnt2));
+ adds(cnt2, cnt2, wordSize);
+ eor(rscratch2, result, cnt1);
+ cbnz(rscratch2, DIFFERENCE);
+ br(Assembler::LT, NEXT_WORD);
+
+ // Last longword. In the case where length == 4 we compare the
+ // same longword twice, but that's still faster than another
+ // conditional branch.
+
+ ldr(result, Address(str1));
+ ldr(cnt1, Address(str2));
+ eor(rscratch2, result, cnt1);
+ cbz(rscratch2, LENGTH_DIFF);
+
+ // Find the first different characters in the longwords and
+ // compute their difference.
+ bind(DIFFERENCE);
+ rev(rscratch2, rscratch2);
+ clz(rscratch2, rscratch2);
+ andr(rscratch2, rscratch2, -16);
+ lsrv(result, result, rscratch2);
+ uxthw(result, result);
+ lsrv(cnt1, cnt1, rscratch2);
+ uxthw(cnt1, cnt1);
+ subw(result, result, cnt1);
+ b(DONE);
+ }
+
+ bind(SHORT_STRING);
+ // Is the minimum length zero?
+ cbz(cnt2, LENGTH_DIFF);
+
+ bind(SHORT_LOOP);
+ load_unsigned_short(result, Address(post(str1, 2)));
+ load_unsigned_short(cnt1, Address(post(str2, 2)));
+ subw(result, result, cnt1);
+ cbnz(result, DONE);
+ sub(cnt2, cnt2, 1);
+ cbnz(cnt2, SHORT_LOOP);
+
+ // Strings are equal up to min length. Return the length difference.
+ bind(LENGTH_DIFF);
+ mov(result, tmp1);
+
+ // That's it
+ bind(DONE);
+
+ BLOCK_COMMENT("} string_compare");
+}
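+
+// Illustrative C for the DIFFERENCE step above: rev+clz locate the first
+// differing 16-bit char in the xor of the two longwords, then both chars are
+// shifted down and subtracted (w1/w2 and the helper names are illustrative):
+//
+//   uint64_t diff = w1 ^ w2;                   // 8 bytes from each string
+//   int bit = count_leading_zeros(byte_reverse(diff)) & ~15;
+//   return (int)(uint16_t)(w1 >> bit) - (int)(uint16_t)(w2 >> bit);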
+
+
+void MacroAssembler::string_equals(Register str1, Register str2,
+ Register cnt, Register result,
+ Register tmp1) {
+ Label SAME_CHARS, DONE, SHORT_LOOP, SHORT_STRING,
+ NEXT_WORD;
+
+ const Register tmp2 = rscratch1;
+ assert_different_registers(str1, str2, cnt, result, tmp1, tmp2, rscratch2);
+
+ BLOCK_COMMENT("string_equals {");
+
+ // Start by assuming that the strings are not equal.
+ mov(result, zr);
+
+ // A very short string
+ cmpw(cnt, 4);
+ br(Assembler::LT, SHORT_STRING);
+
+ // Check if the strings start at the same location.
+ cmp(str1, str2);
+ br(Assembler::EQ, SAME_CHARS);
+
+ // Compare longwords
+ {
+ subw(cnt, cnt, 4); // The last longword is a special case
+
+ // Move both string pointers to the last longword of their
+ // strings, negate the remaining count, and convert it to bytes.
+ lea(str1, Address(str1, cnt, Address::uxtw(1)));
+ lea(str2, Address(str2, cnt, Address::uxtw(1)));
+ sub(cnt, zr, cnt, LSL, 1);
+
+ // Loop, loading longwords and comparing them into rscratch2.
+ bind(NEXT_WORD);
+ ldr(tmp1, Address(str1, cnt));
+ ldr(tmp2, Address(str2, cnt));
+ adds(cnt, cnt, wordSize);
+ eor(rscratch2, tmp1, tmp2);
+ cbnz(rscratch2, DONE);
+ br(Assembler::LT, NEXT_WORD);
+
+ // Last longword. In the case where length == 4 we compare the
+ // same longword twice, but that's still faster than another
+ // conditional branch.
+
+ ldr(tmp1, Address(str1));
+ ldr(tmp2, Address(str2));
+ eor(rscratch2, tmp1, tmp2);
+ cbz(rscratch2, SAME_CHARS);
+ b(DONE);
+ }
+
+ bind(SHORT_STRING);
+ // Is the length zero?
+ cbz(cnt, SAME_CHARS);
+
+ bind(SHORT_LOOP);
+ load_unsigned_short(tmp1, Address(post(str1, 2)));
+ load_unsigned_short(tmp2, Address(post(str2, 2)));
+ subw(tmp1, tmp1, tmp2);
+ cbnz(tmp1, DONE);
+ sub(cnt, cnt, 1);
+ cbnz(cnt, SHORT_LOOP);
+
+ // Strings are equal.
+ bind(SAME_CHARS);
+ mov(result, true);
+
+ // That's it
+ bind(DONE);
+
+ BLOCK_COMMENT("} string_equals");
+}
+
+// Compare char[] arrays aligned to 4 bytes
+void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
+ Register result, Register tmp1)
+{
+ Register cnt1 = rscratch1;
+ Register cnt2 = rscratch2;
+ Register tmp2 = rscratch2;
+
+ Label SAME, DIFFER, NEXT, TAIL03, TAIL01;
+
+ int length_offset = arrayOopDesc::length_offset_in_bytes();
+ int base_offset = arrayOopDesc::base_offset_in_bytes(T_CHAR);
+
+ BLOCK_COMMENT("char_arrays_equals {");
+
+ // different until proven equal
+ mov(result, false);
+
+ // same array?
+ cmp(ary1, ary2);
+ br(Assembler::EQ, SAME);
+
+ // ne if either null
+ cbz(ary1, DIFFER);
+ cbz(ary2, DIFFER);
+
+ // lengths ne?
+ ldrw(cnt1, Address(ary1, length_offset));
+ ldrw(cnt2, Address(ary2, length_offset));
+ cmp(cnt1, cnt2);
+ br(Assembler::NE, DIFFER);
+
+ lea(ary1, Address(ary1, base_offset));
+ lea(ary2, Address(ary2, base_offset));
+
+ subs(cnt1, cnt1, 4);
+ br(LT, TAIL03);
+
+ BIND(NEXT);
+ ldr(tmp1, Address(post(ary1, 8)));
+ ldr(tmp2, Address(post(ary2, 8)));
+ subs(cnt1, cnt1, 4);
+ eor(tmp1, tmp1, tmp2);
+ cbnz(tmp1, DIFFER);
+ br(GE, NEXT);
+
+ BIND(TAIL03); // 0-3 chars left, cnt1 = #chars left - 4
+ tst(cnt1, 0b10);
+ br(EQ, TAIL01);
+ ldrw(tmp1, Address(post(ary1, 4)));
+ ldrw(tmp2, Address(post(ary2, 4)));
+ cmp(tmp1, tmp2);
+ br(NE, DIFFER);
+ BIND(TAIL01); // 0-1 chars left
+ tst(cnt1, 0b01);
+ br(EQ, SAME);
+ ldrh(tmp1, ary1);
+ ldrh(tmp2, ary2);
+ cmp(tmp1, tmp2);
+ br(NE, DIFFER);
+
+ BIND(SAME);
+ mov(result, true);
+ BIND(DIFFER); // result already set
+
+ BLOCK_COMMENT("} char_arrays_equals");
+}
+
+// encode char[] to byte[] in ISO_8859_1
+void MacroAssembler::encode_iso_array(Register src, Register dst,
+ Register len, Register result,
+ FloatRegister Vtmp1, FloatRegister Vtmp2,
+ FloatRegister Vtmp3, FloatRegister Vtmp4)
+{
+ Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1;
+ Register tmp1 = rscratch1;
+
+ mov(result, len); // Save initial len
+
+#ifndef BUILTIN_SIM
+ subs(len, len, 32);
+ br(LT, LOOP_8);
+
+// The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions
+// to convert chars to bytes. These set the 'QC' bit in the FPSR if
+// any char could not fit in a byte, so clear the FPSR so we can test it.
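+//
+// Roughly, each 32-char iteration below does (illustrative summary only):
+//   ld1   Vtmp1..Vtmp4, 4 x 8 chars (64 bytes) from src
+//   uqxtn/uqxtn2 narrow Vtmp1,Vtmp2 into the 16 bytes of Vtmp1
+//   uqxtn/uqxtn2 narrow Vtmp3,Vtmp4 into the 16 bytes of Vtmp2
+//   if FPSR.QC is still clear, st1 the 32 narrowed bytes to dst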
+ clear_fpsr();
+
+ BIND(NEXT_32);
+ ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
+ uqxtn(Vtmp1, T8B, Vtmp1, T8H); // uqxtn - write bottom half
+ uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half
+ uqxtn(Vtmp2, T8B, Vtmp3, T8H);
+ uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2
+ get_fpsr(tmp1);
+ cbnzw(tmp1, LOOP_8);
+ st1(Vtmp1, Vtmp2, T16B, post(dst, 32));
+ subs(len, len, 32);
+ add(src, src, 64);
+ br(GE, NEXT_32);
+
+ BIND(LOOP_8);
+ adds(len, len, 32-8);
+ br(LT, LOOP_1);
+ clear_fpsr(); // QC may be set from loop above, clear again
+ BIND(NEXT_8);
+ ld1(Vtmp1, T8H, src);
+ uqxtn(Vtmp1, T8B, Vtmp1, T8H);
+ get_fpsr(tmp1);
+ cbnzw(tmp1, LOOP_1);
+ st1(Vtmp1, T8B, post(dst, 8));
+ subs(len, len, 8);
+ add(src, src, 16);
+ br(GE, NEXT_8);
+
+ BIND(LOOP_1);
+ adds(len, len, 8);
+ br(LE, DONE);
+#else
+ cbz(len, DONE);
+#endif
+ BIND(NEXT_1);
+ ldrh(tmp1, Address(post(src, 2)));
+ tst(tmp1, 0xff00);
+ br(NE, DONE);
+ strb(tmp1, Address(post(dst, 1)));
+ subs(len, len, 1);
+ br(GT, NEXT_1);
+
+ BIND(DONE);
+ sub(result, result, len); // Return index where we stopped
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,1162 @@
+/*
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_MACROASSEMBLER_AARCH64_HPP
+#define CPU_AARCH64_VM_MACROASSEMBLER_AARCH64_HPP
+
+#include "asm/assembler.hpp"
+
+// MacroAssembler extends Assembler by frequently used macros.
+//
+// Instructions for which a 'better' code sequence exists depending
+// on arguments should also go in here.
+
+class MacroAssembler: public Assembler {
+ friend class LIR_Assembler;
+
+ using Assembler::mov;
+
+ protected:
+
+ // Support for VM calls
+ //
+ // This is the base routine called by the different versions of call_VM_leaf. The interpreter
+ // may customize this version by overriding it for its purposes (e.g., to save/restore
+ // additional registers when doing a VM call).
+#ifdef CC_INTERP
+ // c++ interpreter never wants to use interp_masm version of call_VM
+ #define VIRTUAL
+#else
+ #define VIRTUAL virtual
+#endif
+
+ VIRTUAL void call_VM_leaf_base(
+ address entry_point, // the entry point
+ int number_of_arguments, // the number of arguments to pop after the call
+ Label *retaddr = NULL
+ );
+
+ VIRTUAL void call_VM_leaf_base(
+ address entry_point, // the entry point
+ int number_of_arguments, // the number of arguments to pop after the call
+ Label &retaddr) {
+ call_VM_leaf_base(entry_point, number_of_arguments, &retaddr);
+ }
+
+ // This is the base routine called by the different versions of call_VM. The interpreter
+ // may customize this version by overriding it for its purposes (e.g., to save/restore
+ // additional registers when doing a VM call).
+ //
+ // If no java_thread register is specified (noreg) then rthread will be used instead. call_VM_base
+ // returns the register which contains the thread upon return. If a thread register has been
+ // specified, the return value will correspond to that register. If no last_java_sp is specified
+ // (noreg) then rsp will be used instead.
+ VIRTUAL void call_VM_base( // returns the register containing the thread upon return
+ Register oop_result, // where an oop-result ends up if any; use noreg otherwise
+ Register java_thread, // the thread if computed before ; use noreg otherwise
+ Register last_java_sp, // to set up last_Java_frame in stubs; use noreg otherwise
+ address entry_point, // the entry point
+ int number_of_arguments, // the number of arguments (w/o thread) to pop after the call
+ bool check_exceptions // whether to check for pending exceptions after return
+ );
+
+ // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code.
+ // The implementation is only non-empty for the InterpreterMacroAssembler,
+ // as only the interpreter handles PopFrame and ForceEarlyReturn requests.
+ virtual void check_and_handle_popframe(Register java_thread);
+ virtual void check_and_handle_earlyret(Register java_thread);
+
+ void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true);
+
+ // Set if an XOR can be used to encode/decode the compressed klass base
+ uint64_t use_XOR_for_compressed_class_base;
+
+ public:
+ MacroAssembler(CodeBuffer* code) : Assembler(code) {
+ use_XOR_for_compressed_class_base
+ = (operand_valid_for_logical_immediate(false /*is32*/,
+ (uint64_t)Universe::narrow_klass_base())
+ && ((uint64_t)Universe::narrow_klass_base()
+ > (1u << log2_intptr(CompressedClassSpaceSize))));
+ }
+
+ // Biased locking support
+ // lock_reg and obj_reg must be loaded up with the appropriate values.
+ // swap_reg is killed.
+ // tmp_reg is optional. If it is supplied (i.e., != noreg) it will
+ // be killed; if not supplied, push/pop will be used internally to
+ // allocate a temporary (inefficient, avoid if possible).
+ // Optional slow case is for implementations (interpreter and C1) which branch to
+ // slow case directly. Leaves condition codes set for C2's Fast_Lock node.
+ // Returns offset of first potentially-faulting instruction for null
+ // check info (currently consumed only by C1). If
+ // swap_reg_contains_mark is true then returns -1 as it is assumed
+ // the calling code has already passed any potential faults.
+ int biased_locking_enter(Register lock_reg, Register obj_reg,
+ Register swap_reg, Register tmp_reg,
+ bool swap_reg_contains_mark,
+ Label& done, Label* slow_case = NULL,
+ BiasedLockingCounters* counters = NULL);
+ void biased_locking_exit (Register obj_reg, Register temp_reg, Label& done);
+
+
+ // Helper functions for statistics gathering.
+ // Unconditional atomic increment.
+ void atomic_incw(Register counter_addr, Register tmp);
+ void atomic_incw(Address counter_addr, Register tmp1, Register tmp2) {
+ lea(tmp1, counter_addr);
+ atomic_incw(tmp1, tmp2);
+ }
+ // Load Effective Address
+ void lea(Register r, const Address &a) {
+ InstructionMark im(this);
+ code_section()->relocate(inst_mark(), a.rspec());
+ a.lea(this, r);
+ }
+
+ void addmw(Address a, Register incr, Register scratch) {
+ ldrw(scratch, a);
+ addw(scratch, scratch, incr);
+ strw(scratch, a);
+ }
+
+ // Add constant to memory word
+ void addmw(Address a, int imm, Register scratch) {
+ ldrw(scratch, a);
+ if (imm > 0)
+ addw(scratch, scratch, (unsigned)imm);
+ else
+ subw(scratch, scratch, (unsigned)-imm);
+ strw(scratch, a);
+ }
+
+ // Frame creation and destruction shared between JITs.
+ void build_frame(int framesize);
+ void remove_frame(int framesize);
+
+ virtual void _call_Unimplemented(address call_site) {
+ mov(rscratch2, call_site);
+ haltsim();
+ }
+
+#define call_Unimplemented() _call_Unimplemented((address)__PRETTY_FUNCTION__)
+
+ virtual void notify(int type);
+
+ // aliases defined in AARCH64 spec
+
+
+ template<class T>
+ inline void cmpw(Register Rd, T imm) { subsw(zr, Rd, imm); }
+ inline void cmp(Register Rd, unsigned imm) { subs(zr, Rd, imm); }
+
+ inline void cmnw(Register Rd, unsigned imm) { addsw(zr, Rd, imm); }
+ inline void cmn(Register Rd, unsigned imm) { adds(zr, Rd, imm); }
+
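+ // cset/csetw set Rd to 1 when 'cond' holds and to 0 otherwise; as in the
+ // architecture manual they are expressed via CSINC with the inverted
+ // condition, and cneg/cnegw likewise via CSNEG.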
+ void cset(Register Rd, Assembler::Condition cond) {
+ csinc(Rd, zr, zr, ~cond);
+ }
+ void csetw(Register Rd, Assembler::Condition cond) {
+ csincw(Rd, zr, zr, ~cond);
+ }
+
+ void cneg(Register Rd, Register Rn, Assembler::Condition cond) {
+ csneg(Rd, Rn, Rn, ~cond);
+ }
+ void cnegw(Register Rd, Register Rn, Assembler::Condition cond) {
+ csnegw(Rd, Rn, Rn, ~cond);
+ }
+
+ inline void movw(Register Rd, Register Rn) {
+ if (Rd == sp || Rn == sp) {
+ addw(Rd, Rn, 0U);
+ } else {
+ orrw(Rd, zr, Rn);
+ }
+ }
+ inline void mov(Register Rd, Register Rn) {
+ assert(Rd != r31_sp && Rn != r31_sp, "should be");
+ if (Rd == Rn) {
+ } else if (Rd == sp || Rn == sp) {
+ add(Rd, Rn, 0U);
+ } else {
+ orr(Rd, zr, Rn);
+ }
+ }
+
+ inline void moviw(Register Rd, unsigned imm) { orrw(Rd, zr, imm); }
+ inline void movi(Register Rd, unsigned imm) { orr(Rd, zr, imm); }
+
+ inline void tstw(Register Rd, unsigned imm) { andsw(zr, Rd, imm); }
+ inline void tst(Register Rd, unsigned imm) { ands(zr, Rd, imm); }
+
+ inline void bfiw(Register Rd, Register Rn, unsigned lsb, unsigned width) {
+ bfmw(Rd, Rn, ((32 - lsb) & 31), (width - 1));
+ }
+ inline void bfi(Register Rd, Register Rn, unsigned lsb, unsigned width) {
+ bfm(Rd, Rn, ((64 - lsb) & 63), (width - 1));
+ }
+
+ inline void bfxilw(Register Rd, Register Rn, unsigned lsb, unsigned width) {
+ bfmw(Rd, Rn, lsb, (lsb + width - 1));
+ }
+ inline void bfxil(Register Rd, Register Rn, unsigned lsb, unsigned width) {
+ bfm(Rd, Rn, lsb , (lsb + width - 1));
+ }
+
+ inline void sbfizw(Register Rd, Register Rn, unsigned lsb, unsigned width) {
+ sbfmw(Rd, Rn, ((32 - lsb) & 31), (width - 1));
+ }
+ inline void sbfiz(Register Rd, Register Rn, unsigned lsb, unsigned width) {
+ sbfm(Rd, Rn, ((64 - lsb) & 63), (width - 1));
+ }
+
+ inline void sbfxw(Register Rd, Register Rn, unsigned lsb, unsigned width) {
+ sbfmw(Rd, Rn, lsb, (lsb + width - 1));
+ }
+ inline void sbfx(Register Rd, Register Rn, unsigned lsb, unsigned width) {
+ sbfm(Rd, Rn, lsb , (lsb + width - 1));
+ }
+
+ inline void ubfizw(Register Rd, Register Rn, unsigned lsb, unsigned width) {
+ ubfmw(Rd, Rn, ((32 - lsb) & 31), (width - 1));
+ }
+ inline void ubfiz(Register Rd, Register Rn, unsigned lsb, unsigned width) {
+ ubfm(Rd, Rn, ((64 - lsb) & 63), (width - 1));
+ }
+
+ inline void ubfxw(Register Rd, Register Rn, unsigned lsb, unsigned width) {
+ ubfmw(Rd, Rn, lsb, (lsb + width - 1));
+ }
+ inline void ubfx(Register Rd, Register Rn, unsigned lsb, unsigned width) {
+ ubfm(Rd, Rn, lsb , (lsb + width - 1));
+ }
+
+ inline void asrw(Register Rd, Register Rn, unsigned imm) {
+ sbfmw(Rd, Rn, imm, 31);
+ }
+
+ inline void asr(Register Rd, Register Rn, unsigned imm) {
+ sbfm(Rd, Rn, imm, 63);
+ }
+
+ inline void lslw(Register Rd, Register Rn, unsigned imm) {
+ ubfmw(Rd, Rn, ((32 - imm) & 31), (31 - imm));
+ }
+
+ inline void lsl(Register Rd, Register Rn, unsigned imm) {
+ ubfm(Rd, Rn, ((64 - imm) & 63), (63 - imm));
+ }
+
+ inline void lsrw(Register Rd, Register Rn, unsigned imm) {
+ ubfmw(Rd, Rn, imm, 31);
+ }
+
+ inline void lsr(Register Rd, Register Rn, unsigned imm) {
+ ubfm(Rd, Rn, imm, 63);
+ }
+
+ inline void rorw(Register Rd, Register Rn, unsigned imm) {
+ extrw(Rd, Rn, Rn, imm);
+ }
+
+ inline void ror(Register Rd, Register Rn, unsigned imm) {
+ extr(Rd, Rn, Rn, imm);
+ }
+
+ inline void sxtbw(Register Rd, Register Rn) {
+ sbfmw(Rd, Rn, 0, 7);
+ }
+ inline void sxthw(Register Rd, Register Rn) {
+ sbfmw(Rd, Rn, 0, 15);
+ }
+ inline void sxtb(Register Rd, Register Rn) {
+ sbfm(Rd, Rn, 0, 7);
+ }
+ inline void sxth(Register Rd, Register Rn) {
+ sbfm(Rd, Rn, 0, 15);
+ }
+ inline void sxtw(Register Rd, Register Rn) {
+ sbfm(Rd, Rn, 0, 31);
+ }
+
+ inline void uxtbw(Register Rd, Register Rn) {
+ ubfmw(Rd, Rn, 0, 7);
+ }
+ inline void uxthw(Register Rd, Register Rn) {
+ ubfmw(Rd, Rn, 0, 15);
+ }
+ inline void uxtb(Register Rd, Register Rn) {
+ ubfm(Rd, Rn, 0, 7);
+ }
+ inline void uxth(Register Rd, Register Rn) {
+ ubfm(Rd, Rn, 0, 15);
+ }
+ inline void uxtw(Register Rd, Register Rn) {
+ ubfm(Rd, Rn, 0, 31);
+ }
+
+ inline void cmnw(Register Rn, Register Rm) {
+ addsw(zr, Rn, Rm);
+ }
+ inline void cmn(Register Rn, Register Rm) {
+ adds(zr, Rn, Rm);
+ }
+
+ inline void cmpw(Register Rn, Register Rm) {
+ subsw(zr, Rn, Rm);
+ }
+ inline void cmp(Register Rn, Register Rm) {
+ subs(zr, Rn, Rm);
+ }
+
+ inline void negw(Register Rd, Register Rn) {
+ subw(Rd, zr, Rn);
+ }
+
+ inline void neg(Register Rd, Register Rn) {
+ sub(Rd, zr, Rn);
+ }
+
+ inline void negsw(Register Rd, Register Rn) {
+ subsw(Rd, zr, Rn);
+ }
+
+ inline void negs(Register Rd, Register Rn) {
+ subs(Rd, zr, Rn);
+ }
+
+ inline void cmnw(Register Rn, Register Rm, enum shift_kind kind, unsigned shift = 0) {
+ addsw(zr, Rn, Rm, kind, shift);
+ }
+ inline void cmn(Register Rn, Register Rm, enum shift_kind kind, unsigned shift = 0) {
+ adds(zr, Rn, Rm, kind, shift);
+ }
+
+ inline void cmpw(Register Rn, Register Rm, enum shift_kind kind, unsigned shift = 0) {
+ subsw(zr, Rn, Rm, kind, shift);
+ }
+ inline void cmp(Register Rn, Register Rm, enum shift_kind kind, unsigned shift = 0) {
+ subs(zr, Rn, Rm, kind, shift);
+ }
+
+ inline void negw(Register Rd, Register Rn, enum shift_kind kind, unsigned shift = 0) {
+ subw(Rd, zr, Rn, kind, shift);
+ }
+
+ inline void neg(Register Rd, Register Rn, enum shift_kind kind, unsigned shift = 0) {
+ sub(Rd, zr, Rn, kind, shift);
+ }
+
+ inline void negsw(Register Rd, Register Rn, enum shift_kind kind, unsigned shift = 0) {
+ subsw(Rd, zr, Rn, kind, shift);
+ }
+
+ inline void negs(Register Rd, Register Rn, enum shift_kind kind, unsigned shift = 0) {
+ subs(Rd, zr, Rn, kind, shift);
+ }
+
+ inline void mnegw(Register Rd, Register Rn, Register Rm) {
+ msubw(Rd, Rn, Rm, zr);
+ }
+ inline void mneg(Register Rd, Register Rn, Register Rm) {
+ msub(Rd, Rn, Rm, zr);
+ }
+
+ inline void mulw(Register Rd, Register Rn, Register Rm) {
+ maddw(Rd, Rn, Rm, zr);
+ }
+ inline void mul(Register Rd, Register Rn, Register Rm) {
+ madd(Rd, Rn, Rm, zr);
+ }
+
+ inline void smnegl(Register Rd, Register Rn, Register Rm) {
+ smsubl(Rd, Rn, Rm, zr);
+ }
+ inline void smull(Register Rd, Register Rn, Register Rm) {
+ smaddl(Rd, Rn, Rm, zr);
+ }
+
+ inline void umnegl(Register Rd, Register Rn, Register Rm) {
+ umsubl(Rd, Rn, Rm, zr);
+ }
+ inline void umull(Register Rd, Register Rn, Register Rm) {
+ umaddl(Rd, Rn, Rm, zr);
+ }
+
+ // macro assembly operations needed for aarch64
+
+ // first two private routines for loading 32 bit or 64 bit constants
+private:
+
+ void mov_immediate64(Register dst, u_int64_t imm64);
+ void mov_immediate32(Register dst, u_int32_t imm32);
+
+ int push(unsigned int bitset, Register stack);
+ int pop(unsigned int bitset, Register stack);
+
+ void mov(Register dst, Address a);
+
+public:
+ void push(RegSet regs, Register stack) { if (regs.bits()) push(regs.bits(), stack); }
+ void pop(RegSet regs, Register stack) { if (regs.bits()) pop(regs.bits(), stack); }
+
+ // now mov instructions for loading absolute addresses and 32 or
+ // 64 bit integers
+
+ inline void mov(Register dst, address addr)
+ {
+ mov_immediate64(dst, (u_int64_t)addr);
+ }
+
+ inline void mov(Register dst, u_int64_t imm64)
+ {
+ mov_immediate64(dst, imm64);
+ }
+
+ inline void movw(Register dst, u_int32_t imm32)
+ {
+ mov_immediate32(dst, imm32);
+ }
+
+ inline void mov(Register dst, long l)
+ {
+ mov(dst, (u_int64_t)l);
+ }
+
+ inline void mov(Register dst, int i)
+ {
+ mov(dst, (long)i);
+ }
+
+ void movptr(Register r, uintptr_t imm64);
+
+ // macro instructions for accessing and updating floating point
+ // status register
+ //
+ // FPSR : op1 == 011
+ // CRn == 0100
+ // CRm == 0100
+ // op2 == 001
+
+ inline void get_fpsr(Register reg)
+ {
+ mrs(0b011, 0b0100, 0b0100, 0b001, reg);
+ }
+
+ inline void set_fpsr(Register reg)
+ {
+ msr(0b011, 0b0100, 0b0100, 0b001, reg);
+ }
+
+ inline void clear_fpsr()
+ {
+ msr(0b011, 0b0100, 0b0100, 0b001, zr);
+ }
+
+ // idiv variants which deal with MININT/MINLONG as dividend and -1 as divisor
+ int corrected_idivl(Register result, Register ra, Register rb,
+ bool want_remainder, Register tmp = rscratch1);
+ int corrected_idivq(Register result, Register ra, Register rb,
+ bool want_remainder, Register tmp = rscratch1);
+
+ // Support for NULL-checks
+ //
+ // Generates code that causes a NULL OS exception if the content of reg is NULL.
+ // If the accessed location is M[reg + offset] and the offset is known, provide the
+ // offset. No explicit code generation is needed if the offset is within a certain
+ // range (0 <= offset <= page_size).
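+ // (An access to M[reg + offset] with reg == NULL then faults in the
+ // protected page at address zero and is reported as an implicit null check.)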
+
+ virtual void null_check(Register reg, int offset = -1);
+ static bool needs_explicit_null_check(intptr_t offset);
+
+ static address target_addr_for_insn(address insn_addr, unsigned insn);
+ static address target_addr_for_insn(address insn_addr) {
+ unsigned insn = *(unsigned*)insn_addr;
+ return target_addr_for_insn(insn_addr, insn);
+ }
+
+ // Required platform-specific helpers for Label::patch_instructions.
+ // They _shadow_ the declarations in AbstractAssembler, which are undefined.
+ static int pd_patch_instruction_size(address branch, address target);
+ static void pd_patch_instruction(address branch, address target) {
+ pd_patch_instruction_size(branch, target);
+ }
+ static address pd_call_destination(address branch) {
+ return target_addr_for_insn(branch);
+ }
+#ifndef PRODUCT
+ static void pd_print_patched_instruction(address branch);
+#endif
+
+ static int patch_oop(address insn_addr, address o);
+
+ void emit_trampoline_stub(int insts_call_instruction_offset, address target);
+
+ // The following 4 methods return the offset of the appropriate move instruction
+
+ // Support for fast byte/short loading with zero extension (depending on particular CPU)
+ int load_unsigned_byte(Register dst, Address src);
+ int load_unsigned_short(Register dst, Address src);
+
+ // Support for fast byte/short loading with sign extension (depending on particular CPU)
+ int load_signed_byte(Register dst, Address src);
+ int load_signed_short(Register dst, Address src);
+
+ int load_signed_byte32(Register dst, Address src);
+ int load_signed_short32(Register dst, Address src);
+
+ // Support for sign-extension (hi:lo = extend_sign(lo))
+ void extend_sign(Register hi, Register lo);
+
+ // Load and store values by size and signed-ness
+ void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2 = noreg);
+ void store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2 = noreg);
+
+ // Support for inc/dec with optimal instruction selection depending on value
+
+ // x86_64 aliases an unqualified register/address increment and
+ // decrement to call incrementq and decrementq but also supports
+ // explicitly sized calls to incrementq/decrementq or
+ // incrementl/decrementl
+
+ // for aarch64 the proper convention would be to use
+ // increment/decrement for 64 bit operations and
+ // incrementw/decrementw for 32 bit operations. so when porting
+ // x86_64 code we can leave calls to increment/decrement as is,
+ // replace incrementq/decrementq with increment/decrement and
+ // replace incrementl/decrementl with incrementw/decrementw.
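+ // e.g. (illustrative) an x86_64 incrementl(r) becomes incrementw(r) here,
+ // while incrementq(r) becomes increment(r).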
+
+ // n.b. increment/decrement calls with an Address destination will
+ // need to use a scratch register to load the value to be
+ // incremented. increment/decrement calls which add or subtract a
+ // constant value greater than 2^12 will need to use a 2nd scratch
+ // register to hold the constant. So a register increment/decrement
+ // may trash rscratch2, and an address increment/decrement may trash
+ // rscratch1 and rscratch2.
+
+ void decrementw(Address dst, int value = 1);
+ void decrementw(Register reg, int value = 1);
+
+ void decrement(Register reg, int value = 1);
+ void decrement(Address dst, int value = 1);
+
+ void incrementw(Address dst, int value = 1);
+ void incrementw(Register reg, int value = 1);
+
+ void increment(Register reg, int value = 1);
+ void increment(Address dst, int value = 1);
+
+
+ // Alignment
+ void align(int modulus);
+
+ // Stack frame creation/removal
+ void enter()
+ {
+ stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
+ mov(rfp, sp);
+ }
+ void leave()
+ {
+ mov(sp, rfp);
+ ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
+ }
+
+ // Support for getting the JavaThread pointer (i.e.; a reference to thread-local information)
+ // The pointer will be loaded into the thread register.
+ void get_thread(Register thread);
+
+
+ // Support for VM calls
+ //
+ // It is imperative that all calls into the VM are handled via the call_VM macros.
+ // They make sure that the stack linkage is setup correctly. call_VM's correspond
+ // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.
+
+
+ void call_VM(Register oop_result,
+ address entry_point,
+ bool check_exceptions = true);
+ void call_VM(Register oop_result,
+ address entry_point,
+ Register arg_1,
+ bool check_exceptions = true);
+ void call_VM(Register oop_result,
+ address entry_point,
+ Register arg_1, Register arg_2,
+ bool check_exceptions = true);
+ void call_VM(Register oop_result,
+ address entry_point,
+ Register arg_1, Register arg_2, Register arg_3,
+ bool check_exceptions = true);
+
+ // Overloadings with last_Java_sp
+ void call_VM(Register oop_result,
+ Register last_java_sp,
+ address entry_point,
+ int number_of_arguments = 0,
+ bool check_exceptions = true);
+ void call_VM(Register oop_result,
+ Register last_java_sp,
+ address entry_point,
+ Register arg_1,
+ bool check_exceptions = true);
+ void call_VM(Register oop_result,
+ Register last_java_sp,
+ address entry_point,
+ Register arg_1, Register arg_2,
+ bool check_exceptions = true);
+ void call_VM(Register oop_result,
+ Register last_java_sp,
+ address entry_point,
+ Register arg_1, Register arg_2, Register arg_3,
+ bool check_exceptions = true);
+
+ void get_vm_result (Register oop_result, Register thread);
+ void get_vm_result_2(Register metadata_result, Register thread);
+
+ // These always tightly bind to MacroAssembler::call_VM_base
+ // bypassing the virtual implementation
+ void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true);
+ void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true);
+ void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
+ void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
+ void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4, bool check_exceptions = true);
+
+ void call_VM_leaf(address entry_point,
+ int number_of_arguments = 0);
+ void call_VM_leaf(address entry_point,
+ Register arg_1);
+ void call_VM_leaf(address entry_point,
+ Register arg_1, Register arg_2);
+ void call_VM_leaf(address entry_point,
+ Register arg_1, Register arg_2, Register arg_3);
+
+ // These always tightly bind to MacroAssembler::call_VM_leaf_base
+ // bypassing the virtual implementation
+ void super_call_VM_leaf(address entry_point);
+ void super_call_VM_leaf(address entry_point, Register arg_1);
+ void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
+ void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
+ void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);
+
+ // last Java Frame (fills frame anchor)
+ void set_last_Java_frame(Register last_java_sp,
+ Register last_java_fp,
+ address last_java_pc,
+ Register scratch);
+
+ void set_last_Java_frame(Register last_java_sp,
+ Register last_java_fp,
+ Label &last_java_pc,
+ Register scratch);
+
+ void set_last_Java_frame(Register last_java_sp,
+ Register last_java_fp,
+ Register last_java_pc,
+ Register scratch);
+
+ void reset_last_Java_frame(Register thread, bool clearfp, bool clear_pc);
+
+ // thread in the default location (rthread)
+ void reset_last_Java_frame(bool clear_fp, bool clear_pc);
+
+ // Stores
+ void store_check(Register obj); // store check for obj - register is destroyed afterwards
+ void store_check(Register obj, Address dst); // same as above, dst is exact store location (reg. is destroyed)
+
+#if INCLUDE_ALL_GCS
+
+ void g1_write_barrier_pre(Register obj,
+ Register pre_val,
+ Register thread,
+ Register tmp,
+ bool tosca_live,
+ bool expand_call);
+
+ void g1_write_barrier_post(Register store_addr,
+ Register new_val,
+ Register thread,
+ Register tmp,
+ Register tmp2);
+
+#endif // INCLUDE_ALL_GCS
+
+ // split store_check(Register obj) to enhance instruction interleaving
+ void store_check_part_1(Register obj);
+ void store_check_part_2(Register obj);
+
+ // oop manipulations
+ void load_klass(Register dst, Register src);
+ void store_klass(Register dst, Register src);
+ void cmp_klass(Register oop, Register trial_klass, Register tmp);
+
+ void load_heap_oop(Register dst, Address src);
+
+ void load_heap_oop_not_null(Register dst, Address src);
+ void store_heap_oop(Address dst, Register src);
+
+ // currently unimplemented
+ // Used for storing NULL. All other oop constants should be
+ // stored using routines that take a jobject.
+ void store_heap_oop_null(Address dst);
+
+ void load_prototype_header(Register dst, Register src);
+
+ void store_klass_gap(Register dst, Register src);
+
+ // This dummy is to prevent a call to store_heap_oop from
+ // converting a zero (like NULL) into a Register by giving
+ // the compiler two choices it can't resolve
+
+ void store_heap_oop(Address dst, void* dummy);
+
+ void encode_heap_oop(Register d, Register s);
+ void encode_heap_oop(Register r) { encode_heap_oop(r, r); }
+ void decode_heap_oop(Register d, Register s);
+ void decode_heap_oop(Register r) { decode_heap_oop(r, r); }
+ void encode_heap_oop_not_null(Register r);
+ void decode_heap_oop_not_null(Register r);
+ void encode_heap_oop_not_null(Register dst, Register src);
+ void decode_heap_oop_not_null(Register dst, Register src);
+
+ void set_narrow_oop(Register dst, jobject obj);
+
+ void encode_klass_not_null(Register r);
+ void decode_klass_not_null(Register r);
+ void encode_klass_not_null(Register dst, Register src);
+ void decode_klass_not_null(Register dst, Register src);
+
+ void set_narrow_klass(Register dst, Klass* k);
+
+ // if heap base register is used - reinit it with the correct value
+ void reinit_heapbase();
+
+ DEBUG_ONLY(void verify_heapbase(const char* msg);)
+
+ void push_CPU_state();
+ void pop_CPU_state();
+
+ // Round up to a power of two
+ void round_to(Register reg, int modulus);
+
+ // allocation
+ void eden_allocate(
+ Register obj, // result: pointer to object after successful allocation
+ Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
+ int con_size_in_bytes, // object size in bytes if known at compile time
+ Register t1, // temp register
+ Label& slow_case // continuation point if fast allocation fails
+ );
+ void tlab_allocate(
+ Register obj, // result: pointer to object after successful allocation
+ Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise
+ int con_size_in_bytes, // object size in bytes if known at compile time
+ Register t1, // temp register
+ Register t2, // temp register
+ Label& slow_case // continuation point if fast allocation fails
+ );
+ Register tlab_refill(Label& retry_tlab, Label& try_eden, Label& slow_case); // returns TLS address
+ void verify_tlab();
+
+ void incr_allocated_bytes(Register thread,
+ Register var_size_in_bytes, int con_size_in_bytes,
+ Register t1 = noreg);
+
+ // interface method calling
+ void lookup_interface_method(Register recv_klass,
+ Register intf_klass,
+ RegisterOrConstant itable_index,
+ Register method_result,
+ Register scan_temp,
+ Label& no_such_interface);
+
+ // virtual method calling
+ // n.b. x86 allows RegisterOrConstant for vtable_index
+ void lookup_virtual_method(Register recv_klass,
+ RegisterOrConstant vtable_index,
+ Register method_result);
+
+ // Test sub_klass against super_klass, with fast and slow paths.
+
+ // The fast path produces a tri-state answer: yes / no / maybe-slow.
+ // One of the three labels can be NULL, meaning take the fall-through.
+ // If super_check_offset is -1, the value is loaded up from super_klass.
+ // No registers are killed, except temp_reg.
+ void check_klass_subtype_fast_path(Register sub_klass,
+ Register super_klass,
+ Register temp_reg,
+ Label* L_success,
+ Label* L_failure,
+ Label* L_slow_path,
+ RegisterOrConstant super_check_offset = RegisterOrConstant(-1));
+
+ // The rest of the type check; must be wired to a corresponding fast path.
+ // It does not repeat the fast path logic, so don't use it standalone.
+ // The temp_reg and temp2_reg can be noreg, if no temps are available.
+ // Updates the sub's secondary super cache as necessary.
+ // If set_cond_codes, condition codes will be Z on success, NZ on failure.
+ void check_klass_subtype_slow_path(Register sub_klass,
+ Register super_klass,
+ Register temp_reg,
+ Register temp2_reg,
+ Label* L_success,
+ Label* L_failure,
+ bool set_cond_codes = false);
+
+ // Simplified, combined version, good for typical uses.
+ // Falls through on failure.
+ void check_klass_subtype(Register sub_klass,
+ Register super_klass,
+ Register temp_reg,
+ Label& L_success);
+
+ Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);
+
+
+ // Debugging
+
+ // only if +VerifyOops
+ void verify_oop(Register reg, const char* s = "broken oop");
+ void verify_oop_addr(Address addr, const char * s = "broken oop addr");
+
+// TODO: verify method and klass metadata (compare against vptr?)
+ void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {}
+ void _verify_klass_ptr(Register reg, const char * msg, const char * file, int line){}
+
+#define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
+#define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)
+
+ // only if +VerifyFPU
+ void verify_FPU(int stack_depth, const char* s = "illegal FPU state");
+
+ // prints msg, dumps registers and stops execution
+ void stop(const char* msg);
+
+ // prints msg and continues
+ void warn(const char* msg);
+
+ static void debug64(char* msg, int64_t pc, int64_t regs[]);
+
+ void untested() { stop("untested"); }
+
+ void unimplemented(const char* what = "") { char* b = new char[1024]; jio_snprintf(b, 1024, "unimplemented: %s", what); stop(b); }
+
+ void should_not_reach_here() { stop("should not reach here"); }
+
+ // Stack overflow checking
+ void bang_stack_with_offset(int offset) {
+ // stack grows down, caller passes positive offset
+ assert(offset > 0, "must bang with positive offset");
+ mov(rscratch2, -offset);
+ ldr(zr, Address(sp, rscratch2));
+ }
+
+ // Writes to stack successive pages until offset reached to check for
+ // stack overflow + shadow pages. Also, clobbers tmp
+ void bang_stack_size(Register size, Register tmp);
+
+ virtual RegisterOrConstant delayed_value_impl(intptr_t* delayed_value_addr,
+ Register tmp,
+ int offset);
+
+ // Support for serializing memory accesses between threads
+ void serialize_memory(Register thread, Register tmp);
+
+ // Arithmetics
+
+ void addptr(Address dst, int32_t src) {
+ lea(rscratch2, dst);
+ ldr(rscratch1, Address(rscratch2));
+ add(rscratch1, rscratch1, src);
+ str(rscratch1, Address(rscratch2));
+ }
+
+ void cmpptr(Register src1, Address src2);
+
+ void cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
+ Label &succeed, Label *fail);
+
+ void cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
+ Label &succeed, Label *fail);
+
+ void atomic_add(Register prev, RegisterOrConstant incr, Register addr);
+ void atomic_addw(Register prev, RegisterOrConstant incr, Register addr);
+
+ void atomic_xchg(Register prev, Register newv, Register addr);
+ void atomic_xchgw(Register prev, Register newv, Register addr);
+
+ void orptr(Address adr, RegisterOrConstant src) {
+ ldr(rscratch2, adr);
+ if (src.is_register())
+ orr(rscratch2, rscratch2, src.as_register());
+ else
+ orr(rscratch2, rscratch2, src.as_constant());
+ str(rscratch2, adr);
+ }
+
+ // Calls
+
+ void trampoline_call(Address entry, CodeBuffer *cbuf = NULL);
+
+ static bool far_branches() {
+ return ReservedCodeCacheSize > branch_range;
+ }
+
+ // Jumps that can reach anywhere in the code cache.
+ // Trashes tmp.
+ void far_call(Address entry, CodeBuffer *cbuf = NULL, Register tmp = rscratch1);
+ void far_jump(Address entry, CodeBuffer *cbuf = NULL, Register tmp = rscratch1);
+
+ static int far_branch_size() {
+ if (far_branches()) {
+ return 3 * 4; // adrp, add, br
+ } else {
+ return 4;
+ }
+ }
+
+ // Emit the CompiledIC call idiom
+ void ic_call(address entry);
+
+public:
+
+ // Data
+
+ void mov_metadata(Register dst, Metadata* obj);
+ Address allocate_metadata_address(Metadata* obj);
+ Address constant_oop_address(jobject obj);
+
+ void movoop(Register dst, jobject obj, bool immediate = false);
+
+ // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
+ void kernel_crc32(Register crc, Register buf, Register len,
+ Register table0, Register table1, Register table2, Register table3,
+ Register tmp, Register tmp2, Register tmp3);
+
+#undef VIRTUAL
+
+ // Stack push and pop individual 64 bit registers
+ void push(Register src);
+ void pop(Register dst);
+
+ // push all registers onto the stack
+ void pusha();
+ void popa();
+
+ void repne_scan(Register addr, Register value, Register count,
+ Register scratch);
+ void repne_scanw(Register addr, Register value, Register count,
+ Register scratch);
+
+ typedef void (MacroAssembler::* add_sub_imm_insn)(Register Rd, Register Rn, unsigned imm);
+ typedef void (MacroAssembler::* add_sub_reg_insn)(Register Rd, Register Rn, Register Rm, enum shift_kind kind, unsigned shift);
+
+ // If a constant does not fit in an immediate field, generate some
+ // number of MOV instructions and then perform the operation
+ void wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
+ add_sub_imm_insn insn1,
+ add_sub_reg_insn insn2);
+ // Separate version which sets the flags
+ void wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
+ add_sub_imm_insn insn1,
+ add_sub_reg_insn insn2);
+
+#define WRAP(INSN) \
+ void INSN(Register Rd, Register Rn, unsigned imm) { \
+ wrap_add_sub_imm_insn(Rd, Rn, imm, &Assembler::INSN, &Assembler::INSN); \
+ } \
+ \
+ void INSN(Register Rd, Register Rn, Register Rm, \
+ enum shift_kind kind, unsigned shift = 0) { \
+ Assembler::INSN(Rd, Rn, Rm, kind, shift); \
+ } \
+ \
+ void INSN(Register Rd, Register Rn, Register Rm) { \
+ Assembler::INSN(Rd, Rn, Rm); \
+ } \
+ \
+ void INSN(Register Rd, Register Rn, Register Rm, \
+ ext::operation option, int amount = 0) { \
+ Assembler::INSN(Rd, Rn, Rm, option, amount); \
+ }
+
+ WRAP(add) WRAP(addw) WRAP(sub) WRAP(subw)
+
+#undef WRAP
+#define WRAP(INSN) \
+ void INSN(Register Rd, Register Rn, unsigned imm) { \
+ wrap_adds_subs_imm_insn(Rd, Rn, imm, &Assembler::INSN, &Assembler::INSN); \
+ } \
+ \
+ void INSN(Register Rd, Register Rn, Register Rm, \
+ enum shift_kind kind, unsigned shift = 0) { \
+ Assembler::INSN(Rd, Rn, Rm, kind, shift); \
+ } \
+ \
+ void INSN(Register Rd, Register Rn, Register Rm) { \
+ Assembler::INSN(Rd, Rn, Rm); \
+ } \
+ \
+ void INSN(Register Rd, Register Rn, Register Rm, \
+ ext::operation option, int amount = 0) { \
+ Assembler::INSN(Rd, Rn, Rm, option, amount); \
+ }
+
+ WRAP(adds) WRAP(addsw) WRAP(subs) WRAP(subsw)
+
+ void add(Register Rd, Register Rn, RegisterOrConstant increment);
+ void addw(Register Rd, Register Rn, RegisterOrConstant increment);
+
+ void adrp(Register reg1, const Address &dest, unsigned long &byte_offset);
+
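+ // Emit the dispatch sequence for a tableswitch: bounds-check 'index'
+ // against [lowbound, highbound) and branch into a table of stubs starting
+ // at 'jumptable', each entry 'stride' instructions long; out-of-range
+ // indices branch to 'jumptable_end'.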
+ void tableswitch(Register index, jint lowbound, jint highbound,
+ Label &jumptable, Label &jumptable_end, int stride = 1) {
+ adr(rscratch1, jumptable);
+ subsw(rscratch2, index, lowbound);
+ subsw(zr, rscratch2, highbound - lowbound);
+ br(Assembler::HS, jumptable_end);
+ add(rscratch1, rscratch1, rscratch2,
+ ext::sxtw, exact_log2(stride * Assembler::instruction_size));
+ br(rscratch1);
+ }
+
+ // Form an address from base + offset in Rd. Rd may or may not
+ // actually be used: you must use the Address that is returned. It
+ // is up to you to ensure that the shift provided matches the size
+ // of your data.
+ Address form_address(Register Rd, Register base, long byte_offset, int shift);
+
+ // Prolog generator routines to support switch between x86 code and
+ // generated ARM code
+
+ // routine to generate an x86 prolog for a stub function which
+ // bootstraps into the generated ARM code which directly follows the
+ // stub
+ //
+
+ public:
+ // enum used for aarch64--x86 linkage to define return type of x86 function
+ enum ret_type { ret_type_void, ret_type_integral, ret_type_float, ret_type_double};
+
+#ifdef BUILTIN_SIM
+ void c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type, address *prolog_ptr = NULL);
+#else
+ void c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type) { }
+#endif
+
+ // special version of call_VM_leaf_base needed for aarch64 simulator
+ // where we need to specify both the gp and fp arg counts and the
+ // return type so that the linkage routine from aarch64 to x86 and
+ // back knows which aarch64 registers to copy to x86 registers and
+ // which x86 result register to copy back to an aarch64 register
+
+ void call_VM_leaf_base1(
+ address entry_point, // the entry point
+ int number_of_gp_arguments, // the number of gp reg arguments to pass
+ int number_of_fp_arguments, // the number of fp reg arguments to pass
+ ret_type type, // the return type for the call
+ Label* retaddr = NULL
+ );
+
+ void ldr_constant(Register dest, const Address &const_addr) {
+ if (NearCpool) {
+ ldr(dest, const_addr);
+ } else {
+ unsigned long offset;
+ adrp(dest, InternalAddress(const_addr.target()), offset);
+ ldr(dest, Address(dest, offset));
+ }
+ }
+
+ address read_polling_page(Register r, address page, relocInfo::relocType rtype);
+ address read_polling_page(Register r, relocInfo::relocType rtype);
+
+ // Used by aarch64.ad to control code generation
+ static bool use_acq_rel_for_volatile_fields();
+
+ // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
+ void update_byte_crc32(Register crc, Register val, Register table);
+ void update_word_crc32(Register crc, Register v, Register tmp,
+ Register table0, Register table1, Register table2, Register table3,
+ bool upper = false);
+
+ void string_compare(Register str1, Register str2,
+ Register cnt1, Register cnt2, Register result,
+ Register tmp1);
+ void string_equals(Register str1, Register str2,
+ Register cnt, Register result,
+ Register tmp1);
+ void char_arrays_equals(Register ary1, Register ary2,
+ Register result, Register tmp1);
+ void encode_iso_array(Register src, Register dst,
+ Register len, Register result,
+ FloatRegister Vtmp1, FloatRegister Vtmp2,
+ FloatRegister Vtmp3, FloatRegister Vtmp4);
+ void string_indexof(Register str1, Register str2,
+ Register cnt1, Register cnt2,
+ Register tmp1, Register tmp2,
+ Register tmp3, Register tmp4,
+ int int_cnt1, Register result);
+
+ // ISB may be needed because of a safepoint
+ void maybe_isb() { isb(); }
+};
+
+// Used by aarch64.ad to control code generation
+#define treat_as_volatile(MEM_NODE) \
+ (MacroAssembler::use_acq_rel_for_volatile_fields() ? (MEM_NODE)->is_volatile() : false)
+
+#ifdef ASSERT
+inline bool AbstractAssembler::pd_check_instruction_mark() { return false; }
+#endif
+
+/**
+ * class SkipIfEqual:
+ *
+ * Instantiating this class will result in assembly code being output that will
+ * jump around any code emitted between the creation of the instance and its
+ * automatic destruction at the end of a scope block, depending on the value of
+ * the flag passed to the constructor, which will be checked at run-time.
+ */
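+//
+// A typical (illustrative) use, guarding probe code on a develop/product flag:
+//
+//   { SkipIfEqual skip(masm, &DTraceMethodProbes, false);
+//     // ... code emitted here is branched around when the flag is false
+//   }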
+class SkipIfEqual {
+ private:
+ MacroAssembler* _masm;
+ Label _label;
+
+ public:
+ SkipIfEqual(MacroAssembler*, const bool* flag_addr, bool value);
+ ~SkipIfEqual();
+};
+
+struct tableswitch {
+ Register _reg;
+ int _insn_index; jint _first_key; jint _last_key;
+ Label _after;
+ Label _branches;
+};
+
+#endif // CPU_AARCH64_VM_MACROASSEMBLER_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.inline.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_MACROASSEMBLER_AARCH64_INLINE_HPP
+#define CPU_AARCH64_VM_MACROASSEMBLER_AARCH64_INLINE_HPP
+
+#include "asm/assembler.hpp"
+
+#ifndef PRODUCT
+
+#endif // ndef PRODUCT
+
+#endif // CPU_AARCH64_VM_MACROASSEMBLER_AARCH64_INLINE_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/metaspaceShared_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2004, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "memory/metaspaceShared.hpp"
+
+// Generate the self-patching vtable method:
+//
+// This method will be called (as any other Klass virtual method) with
+// the Klass itself as the first argument. Example:
+//
+// oop obj;
+// int size = obj->klass()->oop_size(this);
+//
+// for which the virtual method call is Klass::oop_size();
+//
+// The dummy method is called with the Klass object as the first
+// operand, and an object as the second argument.
+//
+
+//=====================================================================
+
+// All of the dummy methods in the vtable are essentially identical,
+// differing only by an ordinal constant, and they bear no relationship
+// to the original method which the caller intended. Also, there need
+// to be 'vtbl_list_size' instances of the vtable in order to
+// differentiate between the 'vtbl_list_size' original Klass objects.
+
+#define __ masm->
+
+extern "C" {
+ void aarch64_prolog(void);
+}
+
+void MetaspaceShared::generate_vtable_methods(void** vtbl_list,
+ void** vtable,
+ char** md_top,
+ char* md_end,
+ char** mc_top,
+ char* mc_end) {
+
+#ifdef BUILTIN_SIM
+ // Write a dummy word to the writable shared metaspace.
+ // MetaspaceShared::initialize_shared_spaces will fill it with the
+ // address of aarch64_prolog().
+ address *prolog_ptr = (address*)*md_top;
+ *(intptr_t *)(*md_top) = (intptr_t)0;
+ (*md_top) += sizeof(intptr_t);
+#endif
+
+ intptr_t vtable_bytes = (num_virtuals * vtbl_list_size) * sizeof(void*);
+ *(intptr_t *)(*md_top) = vtable_bytes;
+ *md_top += sizeof(intptr_t);
+ void** dummy_vtable = (void**)*md_top;
+ *vtable = dummy_vtable;
+ *md_top += vtable_bytes;
+
+ // Get ready to generate dummy methods.
+
+ CodeBuffer cb((unsigned char*)*mc_top, mc_end - *mc_top);
+ MacroAssembler* masm = new MacroAssembler(&cb);
+
+ Label common_code;
+ for (int i = 0; i < vtbl_list_size; ++i) {
+ for (int j = 0; j < num_virtuals; ++j) {
+ dummy_vtable[num_virtuals * i + j] = (void*)masm->pc();
+
+ // We're called directly from C code.
+#ifdef BUILTIN_SIM
+ __ c_stub_prolog(8, 0, MacroAssembler::ret_type_integral, prolog_ptr);
+#endif
+ // Load rscratch1 with a value indicating vtable/offset pair.
+ // -- bits[ 7..0] (8 bits) which virtual method in table?
+ // -- bits[12..8] (5 bits) which virtual method table?
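+ // (common_code below recovers the table index with 'lsr #8' and the
+ // method index via the uxtb-extended add.)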
+ __ mov(rscratch1, (i << 8) + j);
+ __ b(common_code);
+ }
+ }
+
+ __ bind(common_code);
+
+ Register tmp0 = r10, tmp1 = r11; // AAPCS64 temporary registers
+ __ enter();
+ __ lsr(tmp0, rscratch1, 8); // isolate vtable identifier.
+ __ mov(tmp1, (address)vtbl_list); // address of list of vtable pointers.
+ __ ldr(tmp1, Address(tmp1, tmp0, Address::lsl(LogBytesPerWord))); // get correct vtable pointer.
+ __ str(tmp1, Address(c_rarg0)); // update vtable pointer in obj.
+ __ add(rscratch1, tmp1, rscratch1, ext::uxtb, LogBytesPerWord); // address of real method pointer.
+ __ ldr(rscratch1, Address(rscratch1)); // get real method pointer.
+ __ blrt(rscratch1, 8, 0, 1); // jump to the real method.
+ __ leave();
+ __ ret(lr);
+
+ *mc_top = (char*)__ pc();
+}
+
+#ifdef BUILTIN_SIM
+void MetaspaceShared::relocate_vtbl_list(char **buffer) {
+ void **sim_entry = (void**)*buffer;
+ *sim_entry = (void*)aarch64_prolog;
+ *buffer += sizeof(intptr_t);
+}
+#endif
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/methodHandles_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,443 @@
+/*
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "memory/allocation.inline.hpp"
+#include "prims/methodHandles.hpp"
+
+#define __ _masm->
+
+#ifdef PRODUCT
+#define BLOCK_COMMENT(str) /* nothing */
+#else
+#define BLOCK_COMMENT(str) __ block_comment(str)
+#endif
+
+#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
+
+void MethodHandles::load_klass_from_Class(MacroAssembler* _masm, Register klass_reg) {
+ if (VerifyMethodHandles)
+ verify_klass(_masm, klass_reg, SystemDictionary::WK_KLASS_ENUM_NAME(java_lang_Class),
+ "MH argument is a Class");
+ __ ldr(klass_reg, Address(klass_reg, java_lang_Class::klass_offset_in_bytes()));
+}
+
+#ifdef ASSERT
+static int check_nonzero(const char* xname, int x) {
+ assert(x != 0, err_msg("%s should be nonzero", xname));
+ return x;
+}
+#define NONZERO(x) check_nonzero(#x, x)
+#else //ASSERT
+#define NONZERO(x) (x)
+#endif //ASSERT
+
+#ifdef ASSERT
+void MethodHandles::verify_klass(MacroAssembler* _masm,
+ Register obj, SystemDictionary::WKID klass_id,
+ const char* error_message) {
+ Klass** klass_addr = SystemDictionary::well_known_klass_addr(klass_id);
+ KlassHandle klass = SystemDictionary::well_known_klass(klass_id);
+ Register temp = rscratch2;
+ Register temp2 = rscratch1; // used by MacroAssembler::cmpptr
+ Label L_ok, L_bad;
+ BLOCK_COMMENT("verify_klass {");
+ __ verify_oop(obj);
+ __ cbz(obj, L_bad);
+ __ push(RegSet::of(temp, temp2), sp);
+ __ load_klass(temp, obj);
+ __ cmpptr(temp, ExternalAddress((address) klass_addr));
+ __ br(Assembler::EQ, L_ok);
+ intptr_t super_check_offset = klass->super_check_offset();
+ __ ldr(temp, Address(temp, super_check_offset));
+ __ cmpptr(temp, ExternalAddress((address) klass_addr));
+ __ br(Assembler::EQ, L_ok);
+ __ pop(RegSet::of(temp, temp2), sp);
+ __ bind(L_bad);
+ __ stop(error_message);
+ __ BIND(L_ok);
+ __ pop(RegSet::of(temp, temp2), sp);
+ BLOCK_COMMENT("} verify_klass");
+}
+
+void MethodHandles::verify_ref_kind(MacroAssembler* _masm, int ref_kind, Register member_reg, Register temp) { }
+
+#endif //ASSERT
+
+void MethodHandles::jump_from_method_handle(MacroAssembler* _masm, Register method, Register temp,
+ bool for_compiler_entry) {
+ assert(method == rmethod, "interpreter calling convention");
+ Label L_no_such_method;
+ __ cbz(rmethod, L_no_such_method);
+ __ verify_method_ptr(method);
+
+ if (!for_compiler_entry && JvmtiExport::can_post_interpreter_events()) {
+ Label run_compiled_code;
+ // JVMTI events, such as single-stepping, are implemented partly by avoiding running
+ // compiled code in threads for which the event is enabled. Check here for
+ // interp_only_mode if these events CAN be enabled.
+
+ __ ldrb(rscratch1, Address(rthread, JavaThread::interp_only_mode_offset()));
+ __ cbnz(rscratch1, run_compiled_code);
+ __ ldr(rscratch1, Address(method, Method::interpreter_entry_offset()));
+ __ br(rscratch1);
+ __ BIND(run_compiled_code);
+ }
+
+ const ByteSize entry_offset = for_compiler_entry ? Method::from_compiled_offset() :
+ Method::from_interpreted_offset();
+ __ ldr(rscratch1, Address(method, entry_offset));
+ __ br(rscratch1);
+ __ bind(L_no_such_method);
+ __ far_jump(RuntimeAddress(StubRoutines::throw_AbstractMethodError_entry()));
+}
+
+void MethodHandles::jump_to_lambda_form(MacroAssembler* _masm,
+ Register recv, Register method_temp,
+ Register temp2,
+ bool for_compiler_entry) {
+ BLOCK_COMMENT("jump_to_lambda_form {");
+ // This is the initial entry point of a lazy method handle.
+ // After type checking, it picks up the invoker from the LambdaForm.
+ assert_different_registers(recv, method_temp, temp2);
+ assert(recv != noreg, "required register");
+ assert(method_temp == rmethod, "required register for loading method");
+
+ //NOT_PRODUCT({ FlagSetting fs(TraceMethodHandles, true); trace_method_handle(_masm, "LZMH"); });
+
+ // Load the invoker, as MH -> MH.form -> LF.vmentry
+ __ verify_oop(recv);
+ __ load_heap_oop(method_temp, Address(recv, NONZERO(java_lang_invoke_MethodHandle::form_offset_in_bytes())));
+ __ verify_oop(method_temp);
+ __ load_heap_oop(method_temp, Address(method_temp, NONZERO(java_lang_invoke_LambdaForm::vmentry_offset_in_bytes())));
+ __ verify_oop(method_temp);
+ // the following assumes that a Method* is normally compressed in the vmtarget field:
+ __ ldr(method_temp, Address(method_temp, NONZERO(java_lang_invoke_MemberName::vmtarget_offset_in_bytes())));
+
+ if (VerifyMethodHandles && !for_compiler_entry) {
+ // make sure recv is already on stack
+ __ ldr(temp2, Address(method_temp, Method::const_offset()));
+ __ load_sized_value(temp2,
+ Address(temp2, ConstMethod::size_of_parameters_offset()),
+ sizeof(u2), /*is_signed*/ false);
+ // assert(sizeof(u2) == sizeof(Method::_size_of_parameters), "");
+ Label L;
+ __ ldr(rscratch1, __ argument_address(temp2, -1));
+ __ cmp(recv, rscratch1);
+ __ br(Assembler::EQ, L);
+ __ ldr(r0, __ argument_address(temp2, -1));
+ __ hlt(0);
+ __ BIND(L);
+ }
+
+ jump_from_method_handle(_masm, method_temp, temp2, for_compiler_entry);
+ BLOCK_COMMENT("} jump_to_lambda_form");
+}
+
+// Code generation
+address MethodHandles::generate_method_handle_interpreter_entry(MacroAssembler* _masm,
+ vmIntrinsics::ID iid) {
+ const bool not_for_compiler_entry = false; // this is the interpreter entry
+ assert(is_signature_polymorphic(iid), "expected invoke iid");
+ if (iid == vmIntrinsics::_invokeGeneric ||
+ iid == vmIntrinsics::_compiledLambdaForm) {
+ // Perhaps surprisingly, the symbolic references visible to Java are not directly used.
+ // They are linked to Java-generated adapters via MethodHandleNatives.linkMethod.
+ // They all allow an appendix argument.
+ __ hlt(0); // empty stubs make SG sick
+ return NULL;
+ }
+
+ // r13: sender SP (must preserve; see prepare_to_jump_from_interpreted)
+ // rmethod: Method*
+ // r3: argument locator (parameter slot count, added to rsp)
+ // r1: used as temp to hold mh or receiver
+ // r0, r11: garbage temps, blown away
+ Register argp = r3; // argument list ptr, live on error paths
+ Register temp = r0;
+ Register mh = r1; // MH receiver; dies quickly and is recycled
+
+ // here's where control starts out:
+ __ align(CodeEntryAlignment);
+ address entry_point = __ pc();
+
+ if (VerifyMethodHandles) {
+ Label L;
+ BLOCK_COMMENT("verify_intrinsic_id {");
+ __ ldrb(rscratch1, Address(rmethod, Method::intrinsic_id_offset_in_bytes()));
+ __ cmp(rscratch1, (int) iid);
+ __ br(Assembler::EQ, L);
+ if (iid == vmIntrinsics::_linkToVirtual ||
+ iid == vmIntrinsics::_linkToSpecial) {
+ // could do this for all kinds, but would explode assembly code size
+ trace_method_handle(_masm, "bad Method*::intrinsic_id");
+ }
+ __ hlt(0);
+ __ bind(L);
+ BLOCK_COMMENT("} verify_intrinsic_id");
+ }
+
+ // First task: Find out how big the argument list is.
+ Address r3_first_arg_addr;
+ int ref_kind = signature_polymorphic_intrinsic_ref_kind(iid);
+ assert(ref_kind != 0 || iid == vmIntrinsics::_invokeBasic, "must be _invokeBasic or a linkTo intrinsic");
+ if (ref_kind == 0 || MethodHandles::ref_kind_has_receiver(ref_kind)) {
+ __ ldr(argp, Address(rmethod, Method::const_offset()));
+ __ load_sized_value(argp,
+ Address(argp, ConstMethod::size_of_parameters_offset()),
+ sizeof(u2), /*is_signed*/ false);
+ // assert(sizeof(u2) == sizeof(Method::_size_of_parameters), "");
+ r3_first_arg_addr = __ argument_address(argp, -1);
+ } else {
+ DEBUG_ONLY(argp = noreg);
+ }
+
+ if (!is_signature_polymorphic_static(iid)) {
+ __ ldr(mh, r3_first_arg_addr);
+ DEBUG_ONLY(argp = noreg);
+ }
+
+ // r3_first_arg_addr is live!
+
+ trace_method_handle_interpreter_entry(_masm, iid);
+ if (iid == vmIntrinsics::_invokeBasic) {
+ generate_method_handle_dispatch(_masm, iid, mh, noreg, not_for_compiler_entry);
+
+ } else {
+ // Adjust argument list by popping the trailing MemberName argument.
+ Register recv = noreg;
+ if (MethodHandles::ref_kind_has_receiver(ref_kind)) {
+ // Load the receiver (not the MH; the actual MemberName's receiver) up from the interpreter stack.
+ __ ldr(recv = r2, r3_first_arg_addr);
+ }
+ DEBUG_ONLY(argp = noreg);
+ Register rmember = rmethod; // MemberName ptr; incoming method ptr is dead now
+ __ pop(rmember); // extract last argument
+ generate_method_handle_dispatch(_masm, iid, recv, rmember, not_for_compiler_entry);
+ }
+
+ return entry_point;
+}
+
+
+void MethodHandles::generate_method_handle_dispatch(MacroAssembler* _masm,
+ vmIntrinsics::ID iid,
+ Register receiver_reg,
+ Register member_reg,
+ bool for_compiler_entry) {
+ assert(is_signature_polymorphic(iid), "expected invoke iid");
+ // temps used in this code are not used in *either* compiled or interpreted calling sequences
+ Register temp1 = r10;
+ Register temp2 = r11;
+ Register temp3 = r14; // r13 is live by this point: it contains the sender SP
+ if (for_compiler_entry) {
+ assert(receiver_reg == (iid == vmIntrinsics::_linkToStatic ? noreg : j_rarg0), "only valid assignment");
+ assert_different_registers(temp1, j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5, j_rarg6, j_rarg7);
+ assert_different_registers(temp2, j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5, j_rarg6, j_rarg7);
+ assert_different_registers(temp3, j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5, j_rarg6, j_rarg7);
+ }
+
+ assert_different_registers(temp1, temp2, temp3, receiver_reg);
+ assert_different_registers(temp1, temp2, temp3, member_reg);
+
+ if (iid == vmIntrinsics::_invokeBasic) {
+ // indirect through MH.form.vmentry.vmtarget
+ jump_to_lambda_form(_masm, receiver_reg, rmethod, temp1, for_compiler_entry);
+
+ } else {
+ // The method is a member invoker used by direct method handles.
+ if (VerifyMethodHandles) {
+ // make sure the trailing argument really is a MemberName (caller responsibility)
+ verify_klass(_masm, member_reg, SystemDictionary::WK_KLASS_ENUM_NAME(java_lang_invoke_MemberName),
+ "MemberName required for invokeVirtual etc.");
+ }
+
+ Address member_clazz( member_reg, NONZERO(java_lang_invoke_MemberName::clazz_offset_in_bytes()));
+ Address member_vmindex( member_reg, NONZERO(java_lang_invoke_MemberName::vmindex_offset_in_bytes()));
+ Address member_vmtarget( member_reg, NONZERO(java_lang_invoke_MemberName::vmtarget_offset_in_bytes()));
+
+ Register temp1_recv_klass = temp1;
+ if (iid != vmIntrinsics::_linkToStatic) {
+ __ verify_oop(receiver_reg);
+ if (iid == vmIntrinsics::_linkToSpecial) {
+ // Don't actually load the klass; just null-check the receiver.
+ __ null_check(receiver_reg);
+ } else {
+ // load receiver klass itself
+ __ null_check(receiver_reg, oopDesc::klass_offset_in_bytes());
+ __ load_klass(temp1_recv_klass, receiver_reg);
+ __ verify_klass_ptr(temp1_recv_klass);
+ }
+ BLOCK_COMMENT("check_receiver {");
+ // The receiver for the MemberName must be in receiver_reg.
+ // Check the receiver against the MemberName.clazz
+ if (VerifyMethodHandles && iid == vmIntrinsics::_linkToSpecial) {
+ // Did not load it above...
+ __ load_klass(temp1_recv_klass, receiver_reg);
+ __ verify_klass_ptr(temp1_recv_klass);
+ }
+ if (VerifyMethodHandles && iid != vmIntrinsics::_linkToInterface) {
+ Label L_ok;
+ Register temp2_defc = temp2;
+ __ load_heap_oop(temp2_defc, member_clazz);
+ load_klass_from_Class(_masm, temp2_defc);
+ __ verify_klass_ptr(temp2_defc);
+ __ check_klass_subtype(temp1_recv_klass, temp2_defc, temp3, L_ok);
+ // If we get here, the type check failed!
+ __ hlt(0);
+ // __ STOP("receiver class disagrees with MemberName.clazz");
+ __ bind(L_ok);
+ }
+ BLOCK_COMMENT("} check_receiver");
+ }
+ if (iid == vmIntrinsics::_linkToSpecial ||
+ iid == vmIntrinsics::_linkToStatic) {
+ DEBUG_ONLY(temp1_recv_klass = noreg); // these guys didn't load the recv_klass
+ }
+
+ // Live registers at this point:
+ // member_reg - MemberName that was the trailing argument
+ // temp1_recv_klass - klass of stacked receiver, if needed
+ // r13 - interpreter linkage (if interpreted) ??? FIXME
+ // r1 ... r0 - compiler arguments (if compiled)
+
+ Label L_incompatible_class_change_error;
+ switch (iid) {
+ case vmIntrinsics::_linkToSpecial:
+ if (VerifyMethodHandles) {
+ verify_ref_kind(_masm, JVM_REF_invokeSpecial, member_reg, temp3);
+ }
+ __ ldr(rmethod, member_vmtarget);
+ break;
+
+ case vmIntrinsics::_linkToStatic:
+ if (VerifyMethodHandles) {
+ verify_ref_kind(_masm, JVM_REF_invokeStatic, member_reg, temp3);
+ }
+ __ ldr(rmethod, member_vmtarget);
+ break;
+
+ case vmIntrinsics::_linkToVirtual:
+ {
+ // same as TemplateTable::invokevirtual,
+ // minus the CP setup and profiling:
+
+ if (VerifyMethodHandles) {
+ verify_ref_kind(_masm, JVM_REF_invokeVirtual, member_reg, temp3);
+ }
+
+ // pick out the vtable index from the MemberName, and then we can discard it:
+ Register temp2_index = temp2;
+ __ ldr(temp2_index, member_vmindex);
+
+ if (VerifyMethodHandles) {
+ Label L_index_ok;
+ __ cmpw(temp2_index, 0U);
+ __ br(Assembler::GE, L_index_ok);
+ __ hlt(0);
+ __ BIND(L_index_ok);
+ }
+
+ // Note: The verifier invariants allow us to ignore MemberName.clazz and vmtarget
+ // at this point. And VerifyMethodHandles has already checked clazz, if needed.
+
+ // get target Method* & entry point
+ __ lookup_virtual_method(temp1_recv_klass, temp2_index, rmethod);
+ break;
+ }
+
+ case vmIntrinsics::_linkToInterface:
+ {
+ // same as TemplateTable::invokeinterface
+ // (minus the CP setup and profiling, with different argument motion)
+ if (VerifyMethodHandles) {
+ verify_ref_kind(_masm, JVM_REF_invokeInterface, member_reg, temp3);
+ }
+
+ Register temp3_intf = temp3;
+ __ load_heap_oop(temp3_intf, member_clazz);
+ load_klass_from_Class(_masm, temp3_intf);
+ __ verify_klass_ptr(temp3_intf);
+
+ Register rindex = rmethod;
+ __ ldr(rindex, member_vmindex);
+ if (VerifyMethodHandles) {
+ Label L;
+ __ cmpw(rindex, 0U);
+ __ br(Assembler::GE, L);
+ __ hlt(0);
+ __ bind(L);
+ }
+
+ // given intf, index, and recv klass, dispatch to the implementation method
+ __ lookup_interface_method(temp1_recv_klass, temp3_intf,
+ // note: next two args must be the same:
+ rindex, rmethod,
+ temp2,
+ L_incompatible_class_change_error);
+ break;
+ }
+
+ default:
+ fatal(err_msg_res("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid)));
+ break;
+ }
+
+ // live at this point: rmethod, r13 (if interpreted)
+
+ // After figuring out which concrete method to call, jump into it.
+ // Note that this works in the interpreter with no data motion.
+ // But the compiled version will require that r2_recv be shifted out.
+ __ verify_method_ptr(rmethod);
+ jump_from_method_handle(_masm, rmethod, temp1, for_compiler_entry);
+ if (iid == vmIntrinsics::_linkToInterface) {
+ __ bind(L_incompatible_class_change_error);
+ __ far_jump(RuntimeAddress(StubRoutines::throw_IncompatibleClassChangeError_entry()));
+ }
+ }
+}
+
+#ifndef PRODUCT
+void trace_method_handle_stub(const char* adaptername,
+ oop mh,
+ intptr_t* saved_regs,
+ intptr_t* entry_sp) { }
+
+// The stub wraps the arguments in a struct on the stack to avoid
+// dealing with the different calling conventions for passing 6
+// arguments.
+struct MethodHandleStubArguments {
+ const char* adaptername;
+ oopDesc* mh;
+ intptr_t* saved_regs;
+ intptr_t* entry_sp;
+};
+void trace_method_handle_stub_wrapper(MethodHandleStubArguments* args) { }
+
+void MethodHandles::trace_method_handle(MacroAssembler* _masm, const char* adaptername) { }
+#endif //PRODUCT
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/methodHandles_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+// Platform-specific definitions for method handles.
+// These definitions are inlined into class MethodHandles.
+
+// Adapters
+enum /* platform_dependent_constants */ {
+ adapter_code_size = 32000 DEBUG_ONLY(+ 120000)
+};
+
+public:
+
+ static void load_klass_from_Class(MacroAssembler* _masm, Register klass_reg);
+
+ static void verify_klass(MacroAssembler* _masm,
+ Register obj, SystemDictionary::WKID klass_id,
+ const char* error_message = "wrong klass") NOT_DEBUG_RETURN;
+
+ static void verify_method_handle(MacroAssembler* _masm, Register mh_reg) {
+ verify_klass(_masm, mh_reg, SystemDictionary::WK_KLASS_ENUM_NAME(java_lang_invoke_MethodHandle),
+ "reference is a MH");
+ }
+
+ static void verify_ref_kind(MacroAssembler* _masm, int ref_kind, Register member_reg, Register temp) NOT_DEBUG_RETURN;
+
+ // Similar to InterpreterMacroAssembler::jump_from_interpreted.
+ // Takes care of special dispatch from single stepping too.
+ static void jump_from_method_handle(MacroAssembler* _masm, Register method, Register temp,
+ bool for_compiler_entry);
+
+ static void jump_to_lambda_form(MacroAssembler* _masm,
+ Register recv, Register method_temp,
+ Register temp2,
+ bool for_compiler_entry);
+
+ static Register saved_last_sp_register() {
+ // Should be in sharedRuntime, not here.
+ return noreg;
+ }
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/nativeInst_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,324 @@
+/*
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "memory/resourceArea.hpp"
+#include "nativeInst_aarch64.hpp"
+#include "oops/oop.inline.hpp"
+#include "runtime/handles.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "utilities/ostream.hpp"
+#ifdef COMPILER1
+#include "c1/c1_Runtime1.hpp"
+#endif
+
+void NativeCall::verify() { ; }
+
+address NativeCall::destination() const {
+ address addr = (address)this;
+ address destination = instruction_address() + displacement();
+
+ // Do we use a trampoline stub for this call?
+ CodeBlob* cb = CodeCache::find_blob_unsafe(addr); // Otherwise we get an assertion if the nmethod is a zombie.
+ assert(cb && cb->is_nmethod(), "sanity");
+ nmethod *nm = (nmethod *)cb;
+ if (nm->stub_contains(destination) && is_NativeCallTrampolineStub_at(destination)) {
+ // Yes we do, so get the destination from the trampoline stub.
+ const address trampoline_stub_addr = destination;
+ destination = nativeCallTrampolineStub_at(trampoline_stub_addr)->destination();
+ }
+
+ return destination;
+}
+
+// Similar to replace_mt_safe, but just changes the destination. The
+// important thing is that free-running threads are able to execute this
+// call instruction at all times.
+//
+// Used in the runtime linkage of calls; see class CompiledIC.
+//
+// Add parameter assert_lock to switch off assertion
+// during code generation, where no patching lock is needed.
+void NativeCall::set_destination_mt_safe(address dest, bool assert_lock) {
+ assert(!assert_lock ||
+ (Patching_lock->is_locked() || SafepointSynchronize::is_at_safepoint()),
+ "concurrent code patching");
+
+ ResourceMark rm;
+ int code_size = NativeInstruction::instruction_size;
+ address addr_call = addr_at(0);
+ assert(NativeCall::is_call_at(addr_call), "unexpected code at call site");
+
+ // Patch the constant in the call's trampoline stub.
+ address trampoline_stub_addr = get_trampoline();
+ if (trampoline_stub_addr != NULL) {
+ assert (! is_NativeCallTrampolineStub_at(dest), "chained trampolines");
+ nativeCallTrampolineStub_at(trampoline_stub_addr)->set_destination(dest);
+ }
+
+ // Patch the call.
+ if (Assembler::reachable_from_branch_at(addr_call, dest)) {
+ set_destination(dest);
+ } else {
+ assert (trampoline_stub_addr != NULL, "we need a trampoline");
+ set_destination(trampoline_stub_addr);
+ }
+
+ ICache::invalidate_range(addr_call, instruction_size);
+}
+
+address NativeCall::get_trampoline() {
+ address call_addr = addr_at(0);
+
+ CodeBlob *code = CodeCache::find_blob(call_addr);
+ assert(code != NULL, "Could not find the containing code blob");
+
+ address bl_destination
+ = MacroAssembler::pd_call_destination(call_addr);
+ if (code->content_contains(bl_destination) &&
+ is_NativeCallTrampolineStub_at(bl_destination))
+ return bl_destination;
+
+ // If the codeBlob is not an nmethod, this is because we get here from the
+ // CodeBlob constructor, which is called within the nmethod constructor.
+ return trampoline_stub_Relocation::get_trampoline_for(call_addr, (nmethod*)code);
+}
+
+// Inserts a native call instruction at a given pc
+void NativeCall::insert(address code_pos, address entry) { Unimplemented(); }
+
+//-------------------------------------------------------------------
+
+void NativeMovConstReg::verify() {
+ // make sure code pattern is actually mov reg64, imm64 instructions
+}
+
+
+intptr_t NativeMovConstReg::data() const {
+ // das(uint64_t(instruction_address()),2);
+ address addr = MacroAssembler::target_addr_for_insn(instruction_address());
+ if (maybe_cpool_ref(instruction_address())) {
+ return *(intptr_t*)addr;
+ } else {
+ return (intptr_t)addr;
+ }
+}
+
+void NativeMovConstReg::set_data(intptr_t x) {
+ if (maybe_cpool_ref(instruction_address())) {
+ address addr = MacroAssembler::target_addr_for_insn(instruction_address());
+ *(intptr_t*)addr = x;
+ } else {
+ MacroAssembler::pd_patch_instruction(instruction_address(), (address)x);
+ ICache::invalidate_range(instruction_address(), instruction_size);
+ }
+};
+
+void NativeMovConstReg::print() {
+ tty->print_cr(PTR_FORMAT ": mov reg, " INTPTR_FORMAT,
+ p2i(instruction_address()), data());
+}
+
+//-------------------------------------------------------------------
+
+address NativeMovRegMem::instruction_address() const { return addr_at(instruction_offset); }
+
+int NativeMovRegMem::offset() const {
+ address pc = instruction_address();
+ unsigned insn = *(unsigned*)pc;
+ if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
+ address addr = MacroAssembler::target_addr_for_insn(pc);
+ return *addr;
+ } else {
+ return (int)(intptr_t)MacroAssembler::target_addr_for_insn(instruction_address());
+ }
+}
+
+void NativeMovRegMem::set_offset(int x) {
+ address pc = instruction_address();
+ unsigned insn = *(unsigned*)pc;
+ if (maybe_cpool_ref(pc)) {
+ address addr = MacroAssembler::target_addr_for_insn(pc);
+ *(long*)addr = x;
+ } else {
+ MacroAssembler::pd_patch_instruction(pc, (address)intptr_t(x));
+ ICache::invalidate_range(instruction_address(), instruction_size);
+ }
+}
+
+void NativeMovRegMem::verify() {
+#ifdef ASSERT
+ address dest = MacroAssembler::target_addr_for_insn(instruction_address());
+#endif
+}
+
+//--------------------------------------------------------------------------------
+
+void NativeJump::verify() { ; }
+
+
+void NativeJump::check_verified_entry_alignment(address entry, address verified_entry) {
+}
+
+
+address NativeJump::jump_destination() const {
+ address dest = MacroAssembler::target_addr_for_insn(instruction_address());
+
+ // We use jump to self as the unresolved address which the inline
+ // cache code (and relocs) knows about
+
+ // return -1 if jump to self
+ dest = (dest == (address) this) ? (address) -1 : dest;
+ return dest;
+}
+
+void NativeJump::set_jump_destination(address dest) {
+ // We use jump to self as the unresolved address which the inline
+ // cache code (and relocs) knows about
+ if (dest == (address) -1)
+ dest = instruction_address();
+
+ MacroAssembler::pd_patch_instruction(instruction_address(), dest);
+ ICache::invalidate_range(instruction_address(), instruction_size);
+};
+
+//-------------------------------------------------------------------
+
+bool NativeInstruction::is_safepoint_poll() {
+ // a safepoint_poll is implemented in two steps as either
+ //
+ // adrp(reg, polling_page);
+ // ldr(zr, [reg, #offset]);
+ //
+ // or
+ //
+ // mov(reg, polling_page);
+ // ldr(zr, [reg, #offset]);
+ //
+ // however, we cannot rely on the polling page address load always
+ // directly preceding the read from the page. C1 does that but C2
+ // has to do the load and read as two independent instruction
+ // generation steps. that's because with a single macro sequence the
+ // generic C2 code can only add the oop map before the mov/adrp and
+ // the trap handler expects an oop map to be associated with the
+ // load. with the load scheduled as a prior step the oop map goes
+ // where it is needed.
+ //
+ // so all we can do here is check that the marked instruction is a load
+ // word to zr
+ return is_ldrw_to_zr(address(this));
+}
+
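+ // ADRP has bit 31 set and bits 28:24 == 0b10000; the mask ignores the immlo
+ // field in bits 30:29.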
+bool NativeInstruction::is_adrp_at(address instr) {
+ unsigned insn = *(unsigned*)instr;
+ return (Instruction_aarch64::extract(insn, 31, 24) & 0b10011111) == 0b10010000;
+}
+
+bool NativeInstruction::is_ldr_literal_at(address instr) {
+ unsigned insn = *(unsigned*)instr;
+ return (Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000;
+}
+
+bool NativeInstruction::is_ldrw_to_zr(address instr) {
+ unsigned insn = *(unsigned*)instr;
+ return (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
+ Instruction_aarch64::extract(insn, 4, 0) == 0b11111);
+}
+
+bool NativeInstruction::is_movz() {
+ return Instruction_aarch64::extract(int_at(0), 30, 23) == 0b10100101;
+}
+
+bool NativeInstruction::is_movk() {
+ return Instruction_aarch64::extract(int_at(0), 30, 23) == 0b11100101;
+}
+
+bool NativeInstruction::is_sigill_zombie_not_entrant() {
+ return uint_at(0) == 0xd4bbd5a1; // dcps1 #0xdead
+}
+
+void NativeIllegalInstruction::insert(address code_pos) {
+ *(juint*)code_pos = 0xd4bbd5a1; // dcps1 #0xdead
+}
+
+//-------------------------------------------------------------------
+
+// MT-safe inserting of a jump over a jump or a nop (used by
+// nmethod::make_not_entrant_or_zombie)
+
+void NativeJump::patch_verified_entry(address entry, address verified_entry, address dest) {
+
+ assert(dest == SharedRuntime::get_handle_wrong_method_stub(), "expected fixed destination of patch");
+ assert(nativeInstruction_at(verified_entry)->is_jump_or_nop()
+ || nativeInstruction_at(verified_entry)->is_sigill_zombie_not_entrant(),
+ "Aarch64 cannot replace non-jump with jump");
+
+ // Patch this nmethod atomically.
+ if (Assembler::reachable_from_branch_at(verified_entry, dest)) {
+ ptrdiff_t disp = dest - verified_entry;
+ guarantee(disp < 1 << 27 && disp > - (1 << 27), "branch overflow");
+
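+ // Assemble an unconditional B: opcode 0b000101 in bits 31:26, imm26 holding
+ // the word offset to dest.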
+ unsigned int insn = (0b000101 << 26) | ((disp >> 2) & 0x3ffffff);
+ *(unsigned int*)verified_entry = insn;
+ } else {
+ // We use an illegal instruction for marking a method as
+ // not_entrant or zombie.
+ NativeIllegalInstruction::insert(verified_entry);
+ }
+
+ ICache::invalidate_range(verified_entry, instruction_size);
+}
+
+void NativeGeneralJump::verify() { }
+
+void NativeGeneralJump::insert_unconditional(address code_pos, address entry) {
+ NativeGeneralJump* n_jump = (NativeGeneralJump*)code_pos;
+
+ CodeBuffer cb(code_pos, instruction_size);
+ MacroAssembler a(&cb);
+
+ a.mov(rscratch1, entry);
+ a.br(rscratch1);
+
+ ICache::invalidate_range(code_pos, instruction_size);
+}
+
+// MT-safe patching of a long jump instruction.
+void NativeGeneralJump::replace_mt_safe(address instr_addr, address code_buffer) {
+ ShouldNotCallThis();
+}
+
+bool NativeInstruction::is_dtrace_trap() { return false; }
+
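+ // The branch target is the 64-bit literal stored at data_offset, immediately
+ // after the ldr/br pair of the trampoline stub.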
+address NativeCallTrampolineStub::destination(nmethod *nm) const {
+ return ptr_at(data_offset);
+}
+
+void NativeCallTrampolineStub::set_destination(address new_destination) {
+ set_ptr_at(data_offset, new_destination);
+ OrderAccess::release();
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/nativeInst_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,491 @@
+/*
+ * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_NATIVEINST_AARCH64_HPP
+#define CPU_AARCH64_VM_NATIVEINST_AARCH64_HPP
+
+#include "asm/assembler.hpp"
+#include "memory/allocation.hpp"
+#include "runtime/icache.hpp"
+#include "runtime/os.hpp"
+#include "utilities/top.hpp"
+
+// We have interfaces for the following instructions:
+// - NativeInstruction
+// - - NativeCall
+// - - NativeMovConstReg
+// - - NativeMovConstRegPatching
+// - - NativeMovRegMem
+// - - NativeMovRegMemPatching
+// - - NativeJump
+// - - NativeIllegalOpCode
+// - - NativeGeneralJump
+// - - NativeReturn
+// - - NativeReturnX (return with argument)
+// - - NativePushConst
+// - - NativeTstRegMem
+
+// The base class for different kinds of native instruction abstractions.
+// Provides the primitive operations to manipulate code relative to this.
+
+class NativeInstruction VALUE_OBJ_CLASS_SPEC {
+ friend class Relocation;
+ friend bool is_NativeCallTrampolineStub_at(address);
+ public:
+ enum { instruction_size = 4 };
+ inline bool is_nop();
+ bool is_dtrace_trap();
+ inline bool is_illegal();
+ inline bool is_return();
+ bool is_jump();
+ inline bool is_jump_or_nop();
+ inline bool is_cond_jump();
+ bool is_safepoint_poll();
+ inline bool is_mov_literal64();
+ bool is_movz();
+ bool is_movk();
+ bool is_sigill_zombie_not_entrant();
+
+ protected:
+ address addr_at(int offset) const { return address(this) + offset; }
+
+ s_char sbyte_at(int offset) const { return *(s_char*) addr_at(offset); }
+ u_char ubyte_at(int offset) const { return *(u_char*) addr_at(offset); }
+
+ jint int_at(int offset) const { return *(jint*) addr_at(offset); }
+ juint uint_at(int offset) const { return *(juint*) addr_at(offset); }
+
+ address ptr_at(int offset) const { return *(address*) addr_at(offset); }
+
+ oop oop_at (int offset) const { return *(oop*) addr_at(offset); }
+
+
+ void set_char_at(int offset, char c) { *addr_at(offset) = (u_char)c; }
+ void set_int_at(int offset, jint i) { *(jint*)addr_at(offset) = i; }
+ void set_uint_at(int offset, jint i) { *(juint*)addr_at(offset) = i; }
+ void set_ptr_at (int offset, address ptr) { *(address*) addr_at(offset) = ptr; }
+ void set_oop_at (int offset, oop o) { *(oop*) addr_at(offset) = o; }
+
+ public:
+
+ // unit test stuff
+ static void test() {} // override for testing
+
+ inline friend NativeInstruction* nativeInstruction_at(address address);
+
+ static bool is_adrp_at(address instr);
+ static bool is_ldr_literal_at(address instr);
+ static bool is_ldrw_to_zr(address instr);
+
+ static bool maybe_cpool_ref(address instr) {
+ return is_adrp_at(instr) || is_ldr_literal_at(instr);
+ }
+};
+
+inline NativeInstruction* nativeInstruction_at(address address) {
+ return (NativeInstruction*)address;
+}
+
+// The natural type of an AArch64 instruction is uint32_t
+inline NativeInstruction* nativeInstruction_at(uint32_t *address) {
+ return (NativeInstruction*)address;
+}
+
+inline NativeCall* nativeCall_at(address address);
+// The NativeCall is an abstraction for accessing/manipulating native call imm32/rel32off
+// instructions (used to manipulate inline caches, primitive & dll calls, etc.).
+
+class NativeCall: public NativeInstruction {
+ public:
+ enum Aarch64_specific_constants {
+ instruction_size = 4,
+ instruction_offset = 0,
+ displacement_offset = 0,
+ return_address_offset = 4
+ };
+
+ enum { cache_line_size = BytesPerWord }; // conservative estimate!
+ address instruction_address() const { return addr_at(instruction_offset); }
+ address next_instruction_address() const { return addr_at(return_address_offset); }
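+ // The BL/B immediate sits in bits 25:0 as a word offset: shift left 6 so the
+ // imm26 sign bit reaches bit 31, then arithmetic-shift right 4 to sign-extend
+ // and scale it to a byte displacement.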
+ int displacement() const { return (int_at(displacement_offset) << 6) >> 4; }
+ address displacement_address() const { return addr_at(displacement_offset); }
+ address return_address() const { return addr_at(return_address_offset); }
+ address destination() const;
+
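+ // Rewrite this instruction as an unconditional BL (opcode 0b100101) whose
+ // imm26 field holds the word offset from here to dest.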
+ void set_destination(address dest) {
+ int offset = dest - instruction_address();
+ unsigned int insn = 0b100101 << 26;
+ assert((offset & 3) == 0, "should be");
+ offset >>= 2;
+ offset &= (1 << 26) - 1; // mask off insn part
+ insn |= offset;
+ set_int_at(displacement_offset, insn);
+ }
+
+ void verify_alignment() { ; }
+ void verify();
+ void print();
+
+ // Creation
+ inline friend NativeCall* nativeCall_at(address address);
+ inline friend NativeCall* nativeCall_before(address return_address);
+
+ static bool is_call_at(address instr) {
+ const uint32_t insn = (*(uint32_t*)instr);
+ return (insn >> 26) == 0b100101;
+ }
+
+ static bool is_call_before(address return_address) {
+ return is_call_at(return_address - NativeCall::return_address_offset);
+ }
+
+ // MT-safe patching of a call instruction.
+ static void insert(address code_pos, address entry);
+
+ static void replace_mt_safe(address instr_addr, address code_buffer);
+
+ // Similar to replace_mt_safe, but just changes the destination. The
+ // important thing is that free-running threads are able to execute
+ // this call instruction at all times. If the call is an immediate BL
+ // instruction we can simply rely on atomicity of 32-bit writes to
+ // make sure other threads will see no intermediate states.
+
+ // We cannot rely on locks here, since the free-running threads must run at
+ // full speed.
+ //
+ // Used in the runtime linkage of calls; see class CompiledIC.
+ // (Cf. 4506997 and 4479829, where threads witnessed garbage displacements.)
+
+ // The parameter assert_lock disables the assertion during code generation.
+ void set_destination_mt_safe(address dest, bool assert_lock = true);
+
+ address get_trampoline();
+};
+
+inline NativeCall* nativeCall_at(address address) {
+ NativeCall* call = (NativeCall*)(address - NativeCall::instruction_offset);
+#ifdef ASSERT
+ call->verify();
+#endif
+ return call;
+}
+
+inline NativeCall* nativeCall_before(address return_address) {
+ NativeCall* call = (NativeCall*)(return_address - NativeCall::return_address_offset);
+#ifdef ASSERT
+ call->verify();
+#endif
+ return call;
+}
+
+// An interface for accessing/manipulating native mov reg, imm instructions.
+// (used to manipulate inlined 64-bit data calls, etc.)
+class NativeMovConstReg: public NativeInstruction {
+ public:
+ enum Aarch64_specific_constants {
+ instruction_size = 3 * 4, // movz, movk, movk. See movptr().
+ instruction_offset = 0,
+ displacement_offset = 0,
+ };
+
+ address instruction_address() const { return addr_at(instruction_offset); }
+ address next_instruction_address() const {
+ if (nativeInstruction_at(instruction_address())->is_movz())
+ // Assume movz, movk, movk
+ return addr_at(instruction_size);
+ else if (is_adrp_at(instruction_address()))
+ return addr_at(2*4);
+ else if (is_ldr_literal_at(instruction_address()))
+ return(addr_at(4));
+ assert(false, "Unknown instruction in NativeMovConstReg");
+ return NULL;
+ }
+
+ intptr_t data() const;
+ void set_data(intptr_t x);
+
+ void flush() {
+ if (! maybe_cpool_ref(instruction_address())) {
+ ICache::invalidate_range(instruction_address(), instruction_size);
+ }
+ }
+
+ void verify();
+ void print();
+
+ // unit test stuff
+ static void test() {}
+
+ // Creation
+ inline friend NativeMovConstReg* nativeMovConstReg_at(address address);
+ inline friend NativeMovConstReg* nativeMovConstReg_before(address address);
+};
+
+inline NativeMovConstReg* nativeMovConstReg_at(address address) {
+ NativeMovConstReg* test = (NativeMovConstReg*)(address - NativeMovConstReg::instruction_offset);
+#ifdef ASSERT
+ test->verify();
+#endif
+ return test;
+}
+
+inline NativeMovConstReg* nativeMovConstReg_before(address address) {
+ NativeMovConstReg* test = (NativeMovConstReg*)(address - NativeMovConstReg::instruction_size - NativeMovConstReg::instruction_offset);
+#ifdef ASSERT
+ test->verify();
+#endif
+ return test;
+}
+
+class NativeMovConstRegPatching: public NativeMovConstReg {
+ private:
+ friend NativeMovConstRegPatching* nativeMovConstRegPatching_at(address address) {
+ NativeMovConstRegPatching* test = (NativeMovConstRegPatching*)(address - instruction_offset);
+ #ifdef ASSERT
+ test->verify();
+ #endif
+ return test;
+ }
+};
+
+// An interface for accessing/manipulating native moves of the form:
+// mov[b/w/l/q] [reg + offset], reg (instruction_code_reg2mem)
+ // mov[b/w/l/q] reg, [reg+offset] (instruction_code_mem2reg)
+// mov[s/z]x[w/b/q] [reg + offset], reg
+// fld_s [reg+offset]
+// fld_d [reg+offset]
+// fstp_s [reg + offset]
+// fstp_d [reg + offset]
+// mov_literal64 scratch,<pointer> ; mov[b/w/l/q] 0(scratch),reg | mov[b/w/l/q] reg,0(scratch)
+//
+// Warning: These routines must be able to handle any instruction sequences
+// that are generated as a result of the load/store byte,word,long
+// macros. For example: The load_unsigned_byte instruction generates
+// an xor reg,reg inst prior to generating the movb instruction. This
+// class must skip the xor instruction.
+
+class NativeMovRegMem: public NativeInstruction {
+ enum AArch64_specific_constants {
+ instruction_size = 4,
+ instruction_offset = 0,
+ data_offset = 0,
+ next_instruction_offset = 4
+ };
+
+ public:
+ // helper
+ int instruction_start() const;
+
+ address instruction_address() const;
+
+ address next_instruction_address() const;
+
+ int offset() const;
+
+ void set_offset(int x);
+
+ void add_offset_in_bytes(int add_offset) { set_offset(offset() + add_offset); }
+
+ void verify();
+ void print ();
+
+ // unit test stuff
+ static void test() {}
+
+ private:
+ inline friend NativeMovRegMem* nativeMovRegMem_at (address address);
+};
+
+inline NativeMovRegMem* nativeMovRegMem_at (address address) {
+ NativeMovRegMem* test = (NativeMovRegMem*)(address - NativeMovRegMem::instruction_offset);
+#ifdef ASSERT
+ test->verify();
+#endif
+ return test;
+}
+
+class NativeMovRegMemPatching: public NativeMovRegMem {
+ private:
+ friend NativeMovRegMemPatching* nativeMovRegMemPatching_at (address address) {Unimplemented(); return 0; }
+};
+
+ // An interface for accessing/manipulating a native leal instruction of the form:
+// leal reg, [reg + offset]
+
+class NativeLoadAddress: public NativeMovRegMem {
+ static const bool has_rex = true;
+ static const int rex_size = 1;
+ public:
+
+ void verify();
+ void print ();
+
+ // unit test stuff
+ static void test() {}
+};
+
+class NativeJump: public NativeInstruction {
+ public:
+ enum AArch64_specific_constants {
+ instruction_size = 4,
+ instruction_offset = 0,
+ data_offset = 0,
+ next_instruction_offset = 4
+ };
+
+ address instruction_address() const { return addr_at(instruction_offset); }
+ address next_instruction_address() const { return addr_at(instruction_size); }
+ address jump_destination() const;
+ void set_jump_destination(address dest);
+
+ // Creation
+ inline friend NativeJump* nativeJump_at(address address);
+
+ void verify();
+
+ // Unit testing stuff
+ static void test() {}
+
+ // Insertion of native jump instruction
+ static void insert(address code_pos, address entry);
+ // MT-safe insertion of native jump at verified method entry
+ static void check_verified_entry_alignment(address entry, address verified_entry);
+ static void patch_verified_entry(address entry, address verified_entry, address dest);
+};
+
+inline NativeJump* nativeJump_at(address address) {
+ NativeJump* jump = (NativeJump*)(address - NativeJump::instruction_offset);
+#ifdef ASSERT
+ jump->verify();
+#endif
+ return jump;
+}
+
+class NativeGeneralJump: public NativeJump {
+public:
+ enum AArch64_specific_constants {
+ instruction_size = 4 * 4,
+ instruction_offset = 0,
+ data_offset = 0,
+ next_instruction_offset = 4 * 4
+ };
+ static void insert_unconditional(address code_pos, address entry);
+ static void replace_mt_safe(address instr_addr, address code_buffer);
+ static void verify();
+};
+
+inline NativeGeneralJump* nativeGeneralJump_at(address address) {
+ NativeGeneralJump* jump = (NativeGeneralJump*)(address);
+ debug_only(jump->verify();)
+ return jump;
+}
+
+class NativePopReg : public NativeInstruction {
+ public:
+ // Insert a pop instruction
+ static void insert(address code_pos, Register reg);
+};
+
+
+class NativeIllegalInstruction: public NativeInstruction {
+ public:
+ // Insert an illegal opcode at a specific address
+ static void insert(address code_pos);
+};
+
+ // return instruction that does not pop values off the stack
+class NativeReturn: public NativeInstruction {
+ public:
+};
+
+ // return instruction that does pop values off the stack
+class NativeReturnX: public NativeInstruction {
+ public:
+};
+
+// Simple test vs memory
+class NativeTstRegMem: public NativeInstruction {
+ public:
+};
+
+inline bool NativeInstruction::is_nop() {
+ uint32_t insn = *(uint32_t*)addr_at(0);
+ return insn == 0xd503201f;
+}
+
+inline bool NativeInstruction::is_jump() {
+ uint32_t insn = *(uint32_t*)addr_at(0);
+
+ if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
+ // Unconditional branch (immediate)
+ return true;
+ } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
+ // Conditional branch (immediate)
+ return true;
+ } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
+ // Compare & branch (immediate)
+ return true;
+ } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
+ // Test & branch (immediate)
+ return true;
+ } else
+ return false;
+}
+
+inline bool NativeInstruction::is_jump_or_nop() {
+ return is_nop() || is_jump();
+}
+
+// Call trampoline stubs.
+class NativeCallTrampolineStub : public NativeInstruction {
+ public:
+
+ enum AArch64_specific_constants {
+ instruction_size = 4 * 4,
+ instruction_offset = 0,
+ data_offset = 2 * 4,
+ next_instruction_offset = 4 * 4
+ };
+
+ address destination(nmethod *nm = NULL) const;
+ void set_destination(address new_destination);
+ ptrdiff_t destination_offset() const;
+};
+
+inline bool is_NativeCallTrampolineStub_at(address addr) {
+ // Ensure that the stub is exactly
+ // ldr xscratch1, L
+ // br xscratch1
+ // L:
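+ // i.e. i[0] == 0x58000048 is ldr x8, #8 and i[1] == 0xd61f0100 is br x8
+ // (rscratch1 is r8).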
+ uint32_t *i = (uint32_t *)addr;
+ return i[0] == 0x58000048 && i[1] == 0xd61f0100;
+}
+
+inline NativeCallTrampolineStub* nativeCallTrampolineStub_at(address addr) {
+ assert(is_NativeCallTrampolineStub_at(addr), "no call trampoline found");
+ return (NativeCallTrampolineStub*)addr;
+}
+
+#endif // CPU_AARCH64_VM_NATIVEINST_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/registerMap_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_REGISTERMAP_AARCH64_HPP
+#define CPU_AARCH64_VM_REGISTERMAP_AARCH64_HPP
+
+ // machine-dependent implementation for register maps
+ friend class frame;
+
+ private:
+ // This is the hook for finding a register in a "well-known" location,
+ // such as a register block of a predetermined format.
+ // Since there is none, we just return NULL.
+ // See registerMap_sparc.hpp for an example of grabbing registers
+ // from register save areas of a standard layout.
+ address pd_location(VMReg reg) const {return NULL;}
+
+ // no PD state to clear or copy:
+ void pd_clear() {}
+ void pd_initialize() {}
+ void pd_initialize_from(const RegisterMap* map) {}
+
+#endif // CPU_AARCH64_VM_REGISTERMAP_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/register_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "register_aarch64.hpp"
+
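+ // Each 64-bit register is described by two 32-bit VMReg slots, hence the
+ // << 1 below.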
+const int ConcreteRegisterImpl::max_gpr = RegisterImpl::number_of_registers << 1;
+
+const int ConcreteRegisterImpl::max_fpr
+ = ConcreteRegisterImpl::max_gpr + (FloatRegisterImpl::number_of_registers << 1);
+
+const char* RegisterImpl::name() const {
+ const char* names[number_of_registers] = {
+ "c_rarg0", "c_rarg1", "c_rarg2", "c_rarg3", "c_rarg4", "c_rarg5", "c_rarg6", "c_rarg7",
+ "rscratch1", "rscratch2",
+ "r10", "r11", "r12", "r13", "r14", "r15", "r16",
+ "r17", "r18", "r19",
+ "resp", "rdispatch", "rbcp", "r23", "rlocals", "rmonitors", "rcpool", "rheapbase",
+ "rthread", "rfp", "lr", "sp"
+ };
+ return is_valid() ? names[encoding()] : "noreg";
+}
+
+const char* FloatRegisterImpl::name() const {
+ const char* names[number_of_registers] = {
+ "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+ "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
+ "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+ };
+ return is_valid() ? names[encoding()] : "noreg";
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/register_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_REGISTER_AARCH64_HPP
+#define CPU_AARCH64_VM_REGISTER_AARCH64_HPP
+
+#include "asm/register.hpp"
+
+class VMRegImpl;
+typedef VMRegImpl* VMReg;
+
+// Use Register as shortcut
+class RegisterImpl;
+typedef RegisterImpl* Register;
+
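+ // A Register is just its encoding cast to a pointer; the pointer value is
+ // never dereferenced (see encoding() and is_valid()).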
+inline Register as_Register(int encoding) {
+ return (Register)(intptr_t) encoding;
+}
+
+class RegisterImpl: public AbstractRegisterImpl {
+ public:
+ enum {
+ number_of_registers = 32,
+ number_of_byte_registers = 32
+ };
+
+ // derived registers, offsets, and addresses
+ Register successor() const { return as_Register(encoding() + 1); }
+
+ // construction
+ inline friend Register as_Register(int encoding);
+
+ VMReg as_VMReg();
+
+ // accessors
+ int encoding() const { assert(is_valid(), "invalid register"); return (intptr_t)this; }
+ bool is_valid() const { return 0 <= (intptr_t)this && (intptr_t)this < number_of_registers; }
+ bool has_byte_register() const { return 0 <= (intptr_t)this && (intptr_t)this < number_of_byte_registers; }
+ const char* name() const;
+ int encoding_nocheck() const { return (intptr_t)this; }
+ unsigned long bit(bool yes = true) const { return yes << encoding(); }
+};
+
+// The integer registers of the aarch64 architecture
+
+CONSTANT_REGISTER_DECLARATION(Register, noreg, (-1));
+
+
+CONSTANT_REGISTER_DECLARATION(Register, r0, (0));
+CONSTANT_REGISTER_DECLARATION(Register, r1, (1));
+CONSTANT_REGISTER_DECLARATION(Register, r2, (2));
+CONSTANT_REGISTER_DECLARATION(Register, r3, (3));
+CONSTANT_REGISTER_DECLARATION(Register, r4, (4));
+CONSTANT_REGISTER_DECLARATION(Register, r5, (5));
+CONSTANT_REGISTER_DECLARATION(Register, r6, (6));
+CONSTANT_REGISTER_DECLARATION(Register, r7, (7));
+CONSTANT_REGISTER_DECLARATION(Register, r8, (8));
+CONSTANT_REGISTER_DECLARATION(Register, r9, (9));
+CONSTANT_REGISTER_DECLARATION(Register, r10, (10));
+CONSTANT_REGISTER_DECLARATION(Register, r11, (11));
+CONSTANT_REGISTER_DECLARATION(Register, r12, (12));
+CONSTANT_REGISTER_DECLARATION(Register, r13, (13));
+CONSTANT_REGISTER_DECLARATION(Register, r14, (14));
+CONSTANT_REGISTER_DECLARATION(Register, r15, (15));
+CONSTANT_REGISTER_DECLARATION(Register, r16, (16));
+CONSTANT_REGISTER_DECLARATION(Register, r17, (17));
+CONSTANT_REGISTER_DECLARATION(Register, r18, (18));
+CONSTANT_REGISTER_DECLARATION(Register, r19, (19));
+CONSTANT_REGISTER_DECLARATION(Register, r20, (20));
+CONSTANT_REGISTER_DECLARATION(Register, r21, (21));
+CONSTANT_REGISTER_DECLARATION(Register, r22, (22));
+CONSTANT_REGISTER_DECLARATION(Register, r23, (23));
+CONSTANT_REGISTER_DECLARATION(Register, r24, (24));
+CONSTANT_REGISTER_DECLARATION(Register, r25, (25));
+CONSTANT_REGISTER_DECLARATION(Register, r26, (26));
+CONSTANT_REGISTER_DECLARATION(Register, r27, (27));
+CONSTANT_REGISTER_DECLARATION(Register, r28, (28));
+CONSTANT_REGISTER_DECLARATION(Register, r29, (29));
+CONSTANT_REGISTER_DECLARATION(Register, r30, (30));
+
+CONSTANT_REGISTER_DECLARATION(Register, r31_sp, (31));
+CONSTANT_REGISTER_DECLARATION(Register, zr, (32));
+CONSTANT_REGISTER_DECLARATION(Register, sp, (33));
+
+// Use FloatRegister as shortcut
+class FloatRegisterImpl;
+typedef FloatRegisterImpl* FloatRegister;
+
+inline FloatRegister as_FloatRegister(int encoding) {
+ return (FloatRegister)(intptr_t) encoding;
+}
+
+// The implementation of floating point registers for the architecture
+class FloatRegisterImpl: public AbstractRegisterImpl {
+ public:
+ enum {
+ number_of_registers = 32
+ };
+
+ // construction
+ inline friend FloatRegister as_FloatRegister(int encoding);
+
+ VMReg as_VMReg();
+
+ // derived registers, offsets, and addresses
+ FloatRegister successor() const { return as_FloatRegister(encoding() + 1); }
+
+ // accessors
+ int encoding() const { assert(is_valid(), "invalid register"); return (intptr_t)this; }
+ int encoding_nocheck() const { return (intptr_t)this; }
+ bool is_valid() const { return 0 <= (intptr_t)this && (intptr_t)this < number_of_registers; }
+ const char* name() const;
+
+};
+
+// The float registers of the AARCH64 architecture
+
+CONSTANT_REGISTER_DECLARATION(FloatRegister, fnoreg , (-1));
+
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v0 , ( 0));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v1 , ( 1));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v2 , ( 2));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v3 , ( 3));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v4 , ( 4));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v5 , ( 5));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v6 , ( 6));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v7 , ( 7));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v8 , ( 8));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v9 , ( 9));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v10 , (10));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v11 , (11));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v12 , (12));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v13 , (13));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v14 , (14));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v15 , (15));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v16 , (16));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v17 , (17));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v18 , (18));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v19 , (19));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v20 , (20));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v21 , (21));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v22 , (22));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v23 , (23));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v24 , (24));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v25 , (25));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v26 , (26));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v27 , (27));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v28 , (28));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v29 , (29));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v30 , (30));
+CONSTANT_REGISTER_DECLARATION(FloatRegister, v31 , (31));
+
+// Need to know the total number of registers of all sorts for SharedInfo.
+// Define a class that exports it.
+class ConcreteRegisterImpl : public AbstractRegisterImpl {
+ public:
+ enum {
+ // A big enough number for C2: all the registers plus flags
+ // This number must be large enough to cover REG_COUNT (defined by c2) registers.
+ // There is no requirement that any ordering here matches any ordering c2 gives
+ // its optoregs.
+
+ number_of_registers = (2 * RegisterImpl::number_of_registers +
+ 2 * FloatRegisterImpl::number_of_registers +
+ 1) // flags
+ };
+
+ // added to make it compile
+ static const int max_gpr;
+ static const int max_fpr;
+};
+
+// A set of registers
+class RegSet {
+ uint32_t _bitset;
+
+ RegSet(uint32_t bitset) : _bitset(bitset) { }
+
+public:
+
+ RegSet() : _bitset(0) { }
+
+ RegSet(Register r1) : _bitset(r1->bit()) { }
+
+ RegSet operator+(const RegSet aSet) const {
+ RegSet result(_bitset | aSet._bitset);
+ return result;
+ }
+
+ RegSet operator-(const RegSet aSet) const {
+ RegSet result(_bitset & ~aSet._bitset);
+ return result;
+ }
+
+ RegSet &operator+=(const RegSet aSet) {
+ *this = *this + aSet;
+ return *this;
+ }
+
+ static RegSet of(Register r1) {
+ return RegSet(r1);
+ }
+
+ static RegSet of(Register r1, Register r2) {
+ return of(r1) + r2;
+ }
+
+ static RegSet of(Register r1, Register r2, Register r3) {
+ return of(r1, r2) + r3;
+ }
+
+ static RegSet of(Register r1, Register r2, Register r3, Register r4) {
+ return of(r1, r2, r3) + r4;
+ }
+
+ static RegSet range(Register start, Register end) {
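+ // Build a mask with ones in [start, end]: shift out the bits below start,
+ // then clear the bits above end by shifting up to bit 31 and back down.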
+ uint32_t bits = ~0;
+ bits <<= start->encoding();
+ bits <<= 31 - end->encoding();
+ bits >>= 31 - end->encoding();
+
+ return RegSet(bits);
+ }
+
+ uint32_t bits() const { return _bitset; }
+};
+
+#endif // CPU_AARCH64_VM_REGISTER_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/register_definitions_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "asm/register.hpp"
+#include "register_aarch64.hpp"
+# include "interp_masm_aarch64.hpp"
+
+REGISTER_DEFINITION(Register, r0);
+REGISTER_DEFINITION(Register, r1);
+REGISTER_DEFINITION(Register, r2);
+REGISTER_DEFINITION(Register, r3);
+REGISTER_DEFINITION(Register, r4);
+REGISTER_DEFINITION(Register, r5);
+REGISTER_DEFINITION(Register, r6);
+REGISTER_DEFINITION(Register, r7);
+REGISTER_DEFINITION(Register, r8);
+REGISTER_DEFINITION(Register, r9);
+REGISTER_DEFINITION(Register, r10);
+REGISTER_DEFINITION(Register, r11);
+REGISTER_DEFINITION(Register, r12);
+REGISTER_DEFINITION(Register, r13);
+REGISTER_DEFINITION(Register, r14);
+REGISTER_DEFINITION(Register, r15);
+REGISTER_DEFINITION(Register, r16);
+REGISTER_DEFINITION(Register, r17);
+REGISTER_DEFINITION(Register, r18);
+REGISTER_DEFINITION(Register, r19);
+REGISTER_DEFINITION(Register, r20);
+REGISTER_DEFINITION(Register, r21);
+REGISTER_DEFINITION(Register, r22);
+REGISTER_DEFINITION(Register, r23);
+REGISTER_DEFINITION(Register, r24);
+REGISTER_DEFINITION(Register, r25);
+REGISTER_DEFINITION(Register, r26);
+REGISTER_DEFINITION(Register, r27);
+REGISTER_DEFINITION(Register, r28);
+REGISTER_DEFINITION(Register, r29);
+REGISTER_DEFINITION(Register, r30);
+REGISTER_DEFINITION(Register, sp);
+
+REGISTER_DEFINITION(FloatRegister, v0);
+REGISTER_DEFINITION(FloatRegister, v1);
+REGISTER_DEFINITION(FloatRegister, v2);
+REGISTER_DEFINITION(FloatRegister, v3);
+REGISTER_DEFINITION(FloatRegister, v4);
+REGISTER_DEFINITION(FloatRegister, v5);
+REGISTER_DEFINITION(FloatRegister, v6);
+REGISTER_DEFINITION(FloatRegister, v7);
+REGISTER_DEFINITION(FloatRegister, v8);
+REGISTER_DEFINITION(FloatRegister, v9);
+REGISTER_DEFINITION(FloatRegister, v10);
+REGISTER_DEFINITION(FloatRegister, v11);
+REGISTER_DEFINITION(FloatRegister, v12);
+REGISTER_DEFINITION(FloatRegister, v13);
+REGISTER_DEFINITION(FloatRegister, v14);
+REGISTER_DEFINITION(FloatRegister, v15);
+REGISTER_DEFINITION(FloatRegister, v16);
+REGISTER_DEFINITION(FloatRegister, v17);
+REGISTER_DEFINITION(FloatRegister, v18);
+REGISTER_DEFINITION(FloatRegister, v19);
+REGISTER_DEFINITION(FloatRegister, v20);
+REGISTER_DEFINITION(FloatRegister, v21);
+REGISTER_DEFINITION(FloatRegister, v22);
+REGISTER_DEFINITION(FloatRegister, v23);
+REGISTER_DEFINITION(FloatRegister, v24);
+REGISTER_DEFINITION(FloatRegister, v25);
+REGISTER_DEFINITION(FloatRegister, v26);
+REGISTER_DEFINITION(FloatRegister, v27);
+REGISTER_DEFINITION(FloatRegister, v28);
+REGISTER_DEFINITION(FloatRegister, v29);
+REGISTER_DEFINITION(FloatRegister, v30);
+REGISTER_DEFINITION(FloatRegister, v31);
+
+REGISTER_DEFINITION(Register, zr);
+
+REGISTER_DEFINITION(Register, c_rarg0);
+REGISTER_DEFINITION(Register, c_rarg1);
+REGISTER_DEFINITION(Register, c_rarg2);
+REGISTER_DEFINITION(Register, c_rarg3);
+REGISTER_DEFINITION(Register, c_rarg4);
+REGISTER_DEFINITION(Register, c_rarg5);
+REGISTER_DEFINITION(Register, c_rarg6);
+REGISTER_DEFINITION(Register, c_rarg7);
+
+REGISTER_DEFINITION(FloatRegister, c_farg0);
+REGISTER_DEFINITION(FloatRegister, c_farg1);
+REGISTER_DEFINITION(FloatRegister, c_farg2);
+REGISTER_DEFINITION(FloatRegister, c_farg3);
+REGISTER_DEFINITION(FloatRegister, c_farg4);
+REGISTER_DEFINITION(FloatRegister, c_farg5);
+REGISTER_DEFINITION(FloatRegister, c_farg6);
+REGISTER_DEFINITION(FloatRegister, c_farg7);
+
+REGISTER_DEFINITION(Register, j_rarg0);
+REGISTER_DEFINITION(Register, j_rarg1);
+REGISTER_DEFINITION(Register, j_rarg2);
+REGISTER_DEFINITION(Register, j_rarg3);
+REGISTER_DEFINITION(Register, j_rarg4);
+REGISTER_DEFINITION(Register, j_rarg5);
+REGISTER_DEFINITION(Register, j_rarg6);
+REGISTER_DEFINITION(Register, j_rarg7);
+
+REGISTER_DEFINITION(FloatRegister, j_farg0);
+REGISTER_DEFINITION(FloatRegister, j_farg1);
+REGISTER_DEFINITION(FloatRegister, j_farg2);
+REGISTER_DEFINITION(FloatRegister, j_farg3);
+REGISTER_DEFINITION(FloatRegister, j_farg4);
+REGISTER_DEFINITION(FloatRegister, j_farg5);
+REGISTER_DEFINITION(FloatRegister, j_farg6);
+REGISTER_DEFINITION(FloatRegister, j_farg7);
+
+REGISTER_DEFINITION(Register, rscratch1);
+REGISTER_DEFINITION(Register, rscratch2);
+REGISTER_DEFINITION(Register, esp);
+REGISTER_DEFINITION(Register, rdispatch);
+REGISTER_DEFINITION(Register, rcpool);
+REGISTER_DEFINITION(Register, rmonitors);
+REGISTER_DEFINITION(Register, rlocals);
+REGISTER_DEFINITION(Register, rmethod);
+REGISTER_DEFINITION(Register, rbcp);
+
+REGISTER_DEFINITION(Register, lr);
+REGISTER_DEFINITION(Register, rfp);
+REGISTER_DEFINITION(Register, rthread);
+REGISTER_DEFINITION(Register, rheapbase);
+
+REGISTER_DEFINITION(Register, r31_sp);
+
+// TODO : x86 uses rbp to save SP in method handle code
+// we may need to do the same with fp
+// REGISTER_DEFINITION(Register, rbp_mh_SP_save)
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/relocInfo_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "code/relocInfo.hpp"
+#include "nativeInst_aarch64.hpp"
+#include "oops/oop.inline.hpp"
+#include "runtime/safepoint.hpp"
+
+
+void Relocation::pd_set_data_value(address x, intptr_t o, bool verify_only) {
+ if (verify_only)
+ return;
+
+ int bytes;
+
+ switch(type()) {
+ case relocInfo::oop_type:
+ {
+ oop_Relocation *reloc = (oop_Relocation *)this;
+ if (NativeInstruction::is_ldr_literal_at(addr())) {
+ address constptr = (address)code()->oop_addr_at(reloc->oop_index());
+ bytes = MacroAssembler::pd_patch_instruction_size(addr(), constptr);
+ assert(*(address*)constptr == x, "error in oop relocation");
+ } else {
+ bytes = MacroAssembler::patch_oop(addr(), x);
+ }
+ }
+ break;
+ default:
+ bytes = MacroAssembler::pd_patch_instruction_size(addr(), x);
+ break;
+ }
+ ICache::invalidate_range(addr(), bytes);
+}
+
+address Relocation::pd_call_destination(address orig_addr) {
+ assert(is_call(), "should be a call here");
+ if (is_call()) {
+ address trampoline = nativeCall_at(addr())->get_trampoline();
+ if (trampoline) {
+ return nativeCallTrampolineStub_at(trampoline)->destination();
+ }
+ }
+ if (orig_addr != NULL) {
+ return MacroAssembler::pd_call_destination(orig_addr);
+ }
+ return MacroAssembler::pd_call_destination(addr());
+}
+
+
+void Relocation::pd_set_call_destination(address x) {
+ assert(is_call(), "should be a call here");
+ if (NativeCall::is_call_at(addr())) {
+ address trampoline = nativeCall_at(addr())->get_trampoline();
+ if (trampoline) {
+ nativeCall_at(addr())->set_destination_mt_safe(x, /* assert_lock */false);
+ return;
+ }
+ }
+ assert(addr() != x, "call instruction in an infinite loop");
+ MacroAssembler::pd_patch_instruction(addr(), x);
+ assert(pd_call_destination(addr()) == x, "fail in reloc");
+}
+
+address* Relocation::pd_address_in_code() {
+ return (address*)(addr() + 8);
+}
+
+
+address Relocation::pd_get_address_from_code() {
+ return MacroAssembler::pd_call_destination(addr());
+}
+
+void poll_Relocation::fix_relocation_after_move(const CodeBuffer* src, CodeBuffer* dest) {
+ if (NativeInstruction::maybe_cpool_ref(addr())) {
+ address old_addr = old_addr_for(addr(), src, dest);
+ MacroAssembler::pd_patch_instruction(addr(), MacroAssembler::target_addr_for_insn(old_addr));
+ }
+}
+
+void poll_return_Relocation::fix_relocation_after_move(const CodeBuffer* src, CodeBuffer* dest) {
+ if (NativeInstruction::maybe_cpool_ref(addr())) {
+ address old_addr = old_addr_for(addr(), src, dest);
+ MacroAssembler::pd_patch_instruction(addr(), MacroAssembler::target_addr_for_insn(old_addr));
+ }
+}
+
+void metadata_Relocation::pd_fix_value(address x) {
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/relocInfo_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_RELOCINFO_AARCH64_HPP
+#define CPU_AARCH64_VM_RELOCINFO_AARCH64_HPP
+
+ // machine-dependent parts of class relocInfo
+ private:
+ enum {
+ // Relocations are byte-aligned.
+ offset_unit = 1,
+ // We don't use format().
+ format_width = 0
+ };
+
+#endif // CPU_AARCH64_VM_RELOCINFO_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/runtime_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#ifdef COMPILER2
+#include "asm/macroAssembler.hpp"
+#include "asm/macroAssembler.inline.hpp"
+#include "classfile/systemDictionary.hpp"
+#include "code/vmreg.hpp"
+#include "interpreter/interpreter.hpp"
+#include "opto/runtime.hpp"
+#include "runtime/interfaceSupport.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "runtime/vframeArray.hpp"
+#include "utilities/globalDefinitions.hpp"
+#include "vmreg_aarch64.inline.hpp"
+#endif
+
+
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/sharedRuntime_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,3063 @@
+/*
+ * Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "asm/macroAssembler.inline.hpp"
+#include "code/debugInfoRec.hpp"
+#include "code/icBuffer.hpp"
+#include "code/vtableStubs.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interp_masm.hpp"
+#include "oops/compiledICHolder.hpp"
+#include "prims/jvmtiRedefineClassesTrace.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/vframeArray.hpp"
+#include "vmreg_aarch64.inline.hpp"
+#ifdef COMPILER1
+#include "c1/c1_Runtime1.hpp"
+#endif
+#ifdef COMPILER2
+#include "adfiles/ad_aarch64.hpp"
+#include "opto/runtime.hpp"
+#endif
+
+#ifdef BUILTIN_SIM
+#include "../../../../../../simulator/simulator.hpp"
+#endif
+
+#define __ masm->
+
+const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size;
+
+class SimpleRuntimeFrame {
+
+ public:
+
+ // Most of the runtime stubs have this simple frame layout.
+ // This class exists to make the layout shared in one place.
+ // Offsets are for compiler stack slots, which are jints.
+ enum layout {
+ // The frame sender code expects that rbp will be in the "natural" place and
+ // will override any oopMap setting for it. We must therefore force the layout
+ // so that it agrees with the frame sender code.
+ // we don't expect any arg reg save area so aarch64 asserts that
+ // frame::arg_reg_save_area_bytes == 0
+ rbp_off = 0,
+ rbp_off2,
+ return_off, return_off2,
+ framesize
+ };
+};
+
+// FIXME -- this is used by C1
+class RegisterSaver {
+ public:
+ static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words);
+ static void restore_live_registers(MacroAssembler* masm);
+
+ // Offsets into the register save area
+ // Used by deoptimization when it is managing result register
+ // values on its own
+
+ static int r0_offset_in_bytes(void) { return (32 + r0->encoding()) * wordSize; }
+ static int reg_offset_in_bytes(Register r) { return r0_offset_in_bytes() + r->encoding() * wordSize; }
+ static int rmethod_offset_in_bytes(void) { return reg_offset_in_bytes(rmethod); }
+ static int rscratch1_offset_in_bytes(void) { return (32 + rscratch1->encoding()) * wordSize; }
+ static int v0_offset_in_bytes(void) { return 0; }
+ static int return_offset_in_bytes(void) { return (32 /* floats*/ + 31 /* gregs*/) * wordSize; }
+
+ // During deoptimization only the result registers need to be restored,
+ // all the other values have already been extracted.
+ static void restore_result_registers(MacroAssembler* masm);
+
+ // Capture info about frame layout
+ enum layout {
+ fpu_state_off = 0,
+ fpu_state_end = fpu_state_off+FPUStateSizeInWords-1,
+ // The frame sender code expects that rfp will be in
+ // the "natural" place and will override any oopMap
+ // setting for it. We must therefore force the layout
+ // so that it agrees with the frame sender code.
+ r0_off = fpu_state_off+FPUStateSizeInWords,
+ rfp_off = r0_off + 30 * 2,
+ return_off = rfp_off + 2, // slot for return address
+ reg_save_size = return_off + 2};
+
+};
+
+OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words) {
+ int frame_size_in_bytes = round_to(additional_frame_words*wordSize +
+ reg_save_size*BytesPerInt, 16);
+ // OopMap frame size is in compiler stack slots (jint's) not bytes or words
+ int frame_size_in_slots = frame_size_in_bytes / BytesPerInt;
+ // The caller will allocate additional_frame_words
+ int additional_frame_slots = additional_frame_words*wordSize / BytesPerInt;
+ // CodeBlob frame size is in words.
+ int frame_size_in_words = frame_size_in_bytes / wordSize;
+ *total_frame_words = frame_size_in_words;
+
+ // Save registers, fpu state, and flags.
+
+ __ enter();
+ __ push_CPU_state();
+
+ // Set an oopmap for the call site. This oopmap will map all
+ // oop-registers and debug-info registers as callee-saved. This
+ // will allow deoptimization at this safepoint to find all possible
+ // debug-info recordings, as well as let GC find all oops.
+
+ OopMapSet *oop_maps = new OopMapSet();
+ OopMap* oop_map = new OopMap(frame_size_in_slots, 0);
+
+ for (int i = 0; i < RegisterImpl::number_of_registers; i++) {
+ Register r = as_Register(i);
+ if (r < rheapbase && r != rscratch1 && r != rscratch2) {
+ int sp_offset = 2 * (i + 32); // SP offsets are in 4-byte words,
+ // register slots are 8 bytes
+ // wide, 32 floating-point
+ // registers
+ oop_map->set_callee_saved(VMRegImpl::stack2reg(sp_offset),
+ r->as_VMReg());
+ }
+ }
+
+ for (int i = 0; i < FloatRegisterImpl::number_of_registers; i++) {
+ FloatRegister r = as_FloatRegister(i);
+ int sp_offset = 2 * i;
+ oop_map->set_callee_saved(VMRegImpl::stack2reg(sp_offset),
+ r->as_VMReg());
+ }
+
+ return oop_map;
+}
+
+void RegisterSaver::restore_live_registers(MacroAssembler* masm) {
+ __ pop_CPU_state();
+ __ leave();
+}
+
+void RegisterSaver::restore_result_registers(MacroAssembler* masm) {
+
+ // Just restore result register. Only used by deoptimization. By
+ // now any callee save register that needs to be restored to a c2
+ // caller of the deoptee has been extracted into the vframeArray
+ // and will be stuffed into the c2i adapter we create for later
+ // restoration so only result registers need to be restored here.
+
+ // Restore fp result register
+ __ ldrd(v0, Address(sp, v0_offset_in_bytes()));
+ // Restore integer result register
+ __ ldr(r0, Address(sp, r0_offset_in_bytes()));
+
+ // Pop all of the register save area off the stack
+ __ add(sp, sp, round_to(return_offset_in_bytes(), 16));
+}
+
+// Is the vector's size (in bytes) bigger than the size saved by default?
+// 16-byte XMM registers are saved by default using fxsave/fxrstor instructions.
+bool SharedRuntime::is_wide_vector(int size) {
+ return size > 16;
+}
+// The java_calling_convention describes stack locations as ideal slots on
+// a frame with no abi restrictions. Since we must observe abi restrictions
+// (like the placement of the register window) the slots must be biased by
+// the following value.
+static int reg2offset_in(VMReg r) {
+ // Account for saved rfp and lr
+ // This should really be in_preserve_stack_slots
+ return (r->reg2stack() + 4) * VMRegImpl::stack_slot_size;
+}
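+
+// For example (with 4-byte compiler stack slots), reg2offset_in above puts
+// incoming slot 0 at rfp + 16, i.e. just above the saved rfp/lr pair, and
+// slot 2 one machine word higher at rfp + 24.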
+
+static int reg2offset_out(VMReg r) {
+ return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size;
+}
+
+template <class T> static const T& min (const T& a, const T& b) {
+ return (a > b) ? b : a;
+}
+
+// ---------------------------------------------------------------------------
+// Read the array of BasicTypes from a signature, and compute where the
+// arguments should go. Values in the VMRegPair regs array refer to 4-byte
+// quantities. Values less than VMRegImpl::stack0 are registers, those above
+// refer to 4-byte stack slots. All stack slots are based off of the stack pointer
+// as framesizes are fixed.
+// VMRegImpl::stack0 refers to the first slot 0(sp).
+// and VMRegImpl::stack0+1 refers to the memory word 4 bytes higher. Register
+// values in [0, RegisterImpl::number_of_registers) are the 64-bit
+// integer registers.
+
+// Note: the INPUTS in sig_bt are in units of Java argument words,
+// which are 64-bit. The OUTPUTS are in 32-bit units.
+
+// The Java calling convention is a "shifted" version of the C ABI.
+// By skipping the first C ABI register we can call non-static jni
+// methods with small numbers of arguments without having to shuffle
+// the arguments at all. Since we control the java ABI we ought to at
+// least get some advantage out of it.
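+
+// For illustration (a hypothetical instance method with descriptor (IJD)V):
+// sig_bt arrives as { T_OBJECT, T_INT, T_LONG, T_VOID, T_DOUBLE, T_VOID },
+// and the mapping below puts the receiver in j_rarg0, the int in j_rarg1,
+// the long in j_rarg2 (its trailing T_VOID half is set_bad()) and the
+// double in j_farg0.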
+
+int SharedRuntime::java_calling_convention(const BasicType *sig_bt,
+ VMRegPair *regs,
+ int total_args_passed,
+ int is_outgoing) {
+
+ // Create the mapping between argument positions and
+ // registers.
+ static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = {
+ j_rarg0, j_rarg1, j_rarg2, j_rarg3, j_rarg4, j_rarg5, j_rarg6, j_rarg7
+ };
+ static const FloatRegister FP_ArgReg[Argument::n_float_register_parameters_j] = {
+ j_farg0, j_farg1, j_farg2, j_farg3,
+ j_farg4, j_farg5, j_farg6, j_farg7
+ };
+
+
+ uint int_args = 0;
+ uint fp_args = 0;
+ uint stk_args = 0; // inc by 2 each time
+
+ for (int i = 0; i < total_args_passed; i++) {
+ switch (sig_bt[i]) {
+ case T_BOOLEAN:
+ case T_CHAR:
+ case T_BYTE:
+ case T_SHORT:
+ case T_INT:
+ if (int_args < Argument::n_int_register_parameters_j) {
+ regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
+ } else {
+ regs[i].set1(VMRegImpl::stack2reg(stk_args));
+ stk_args += 2;
+ }
+ break;
+ case T_VOID:
+ // halves of T_LONG or T_DOUBLE
+ assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
+ regs[i].set_bad();
+ break;
+ case T_LONG:
+ assert(sig_bt[i + 1] == T_VOID, "expecting half");
+ // fall through
+ case T_OBJECT:
+ case T_ARRAY:
+ case T_ADDRESS:
+ if (int_args < Argument::n_int_register_parameters_j) {
+ regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
+ } else {
+ regs[i].set2(VMRegImpl::stack2reg(stk_args));
+ stk_args += 2;
+ }
+ break;
+ case T_FLOAT:
+ if (fp_args < Argument::n_float_register_parameters_j) {
+ regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
+ } else {
+ regs[i].set1(VMRegImpl::stack2reg(stk_args));
+ stk_args += 2;
+ }
+ break;
+ case T_DOUBLE:
+ assert(sig_bt[i + 1] == T_VOID, "expecting half");
+ if (fp_args < Argument::n_float_register_parameters_j) {
+ regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
+ } else {
+ regs[i].set2(VMRegImpl::stack2reg(stk_args));
+ stk_args += 2;
+ }
+ break;
+ default:
+ ShouldNotReachHere();
+ break;
+ }
+ }
+
+ return round_to(stk_args, 2);
+}
+
+// Patch the caller's call site with the entry to compiled code, if it exists.
+static void patch_callers_callsite(MacroAssembler *masm) {
+ Label L;
+ __ ldr(rscratch1, Address(rmethod, in_bytes(Method::code_offset())));
+ __ cbz(rscratch1, L);
+
+ __ enter();
+ __ push_CPU_state();
+
+ // VM needs caller's callsite
+ // VM needs target method
+ // This needs to be a long call since we will relocate this adapter to
+ // the codeBuffer and it may not reach
+
+#ifndef PRODUCT
+ assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
+#endif
+
+ __ mov(c_rarg0, rmethod);
+ __ mov(c_rarg1, lr);
+ __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite)));
+ __ blrt(rscratch1, 2, 0, 0);
+ __ maybe_isb();
+
+ __ pop_CPU_state();
+ // restore sp
+ __ leave();
+ __ bind(L);
+}
+
+static void gen_c2i_adapter(MacroAssembler *masm,
+ int total_args_passed,
+ int comp_args_on_stack,
+ const BasicType *sig_bt,
+ const VMRegPair *regs,
+ Label& skip_fixup) {
+ // Before we get into the guts of the C2I adapter, see if we should be here
+ // at all. We've come from compiled code and are attempting to jump to the
+ // interpreter, which means the caller made a static call to get here
+ // (vcalls always get a compiled target if there is one). Check for a
+ // compiled target. If there is one, we need to patch the caller's call.
+ patch_callers_callsite(masm);
+
+ __ bind(skip_fixup);
+
+ int words_pushed = 0;
+
+ // Since all args are passed on the stack, total_args_passed *
+ // Interpreter::stackElementSize is the space we need.
+
+ int extraspace = total_args_passed * Interpreter::stackElementSize;
+
+ __ mov(r13, sp);
+
+ // stack is aligned, keep it that way
+ extraspace = round_to(extraspace, 2*wordSize);
+
+ if (extraspace)
+ __ sub(sp, sp, extraspace);
+
+ // Now write the args into the outgoing interpreter space
+ for (int i = 0; i < total_args_passed; i++) {
+ if (sig_bt[i] == T_VOID) {
+ assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
+ continue;
+ }
+
+ // offset to start parameters
+ int st_off = (total_args_passed - i - 1) * Interpreter::stackElementSize;
+ int next_off = st_off - Interpreter::stackElementSize;
+
+ // Say 4 args:
+ // i st_off
+ // 0 32 T_LONG
+ // 1 24 T_VOID
+ // 2 16 T_OBJECT
+ // 3 8 T_BOOL
+ // - 0 return address
+ //
+ // However, to make things extra confusing: because we can fit a long/double in
+ // a single slot on a 64 bit VM and it would be silly to break them up, the interpreter
+ // leaves one slot empty and only stores to a single slot. In this case the
+ // slot that is occupied is the T_VOID slot. See, I said it was confusing.
+
+ VMReg r_1 = regs[i].first();
+ VMReg r_2 = regs[i].second();
+ if (!r_1->is_valid()) {
+ assert(!r_2->is_valid(), "");
+ continue;
+ }
+ if (r_1->is_stack()) {
+ // memory to memory use rscratch1
+ int ld_off = (r_1->reg2stack() * VMRegImpl::stack_slot_size
+ + extraspace
+ + words_pushed * wordSize);
+ if (!r_2->is_valid()) {
+ // sign extend??
+ __ ldrw(rscratch1, Address(sp, ld_off));
+ __ str(rscratch1, Address(sp, st_off));
+
+ } else {
+
+ __ ldr(rscratch1, Address(sp, ld_off));
+
+ // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
+ // T_DOUBLE and T_LONG use two slots in the interpreter
+ if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
+ // ld_off == LSW, ld_off+wordSize == MSW
+ // st_off == MSW, next_off == LSW
+ __ str(rscratch1, Address(sp, next_off));
+#ifdef ASSERT
+ // Overwrite the unused slot with known junk
+ __ mov(rscratch1, 0xdeadffffdeadaaaaul);
+ __ str(rscratch1, Address(sp, st_off));
+#endif /* ASSERT */
+ } else {
+ __ str(rscratch1, Address(sp, st_off));
+ }
+ }
+ } else if (r_1->is_Register()) {
+ Register r = r_1->as_Register();
+ if (!r_2->is_valid()) {
+ // must be only an int (or less) so move only 32 bits to the slot
+ // why not sign extend??
+ __ str(r, Address(sp, st_off));
+ } else {
+ // Two VMRegs|OptoRegs can be T_OBJECT, T_ADDRESS, T_DOUBLE, T_LONG
+ // T_DOUBLE and T_LONG use two slots in the interpreter
+ if ( sig_bt[i] == T_LONG || sig_bt[i] == T_DOUBLE) {
+ // long/double in gpr
+#ifdef ASSERT
+ // Overwrite the unused slot with known junk
+ __ mov(rscratch1, 0xdeadffffdeadaaabul);
+ __ str(rscratch1, Address(sp, st_off));
+#endif /* ASSERT */
+ __ str(r, Address(sp, next_off));
+ } else {
+ __ str(r, Address(sp, st_off));
+ }
+ }
+ } else {
+ assert(r_1->is_FloatRegister(), "");
+ if (!r_2->is_valid()) {
+ // only a float; use just part of the slot
+ __ strs(r_1->as_FloatRegister(), Address(sp, st_off));
+ } else {
+#ifdef ASSERT
+ // Overwrite the unused slot with known junk
+ __ mov(rscratch1, 0xdeadffffdeadaaacul);
+ __ str(rscratch1, Address(sp, st_off));
+#endif /* ASSERT */
+ __ strd(r_1->as_FloatRegister(), Address(sp, next_off));
+ }
+ }
+ }
+
+ __ mov(esp, sp); // Interp expects args on caller's expression stack
+
+ __ ldr(rscratch1, Address(rmethod, in_bytes(Method::interpreter_entry_offset())));
+ __ br(rscratch1);
+}
+
+
+static void gen_i2c_adapter(MacroAssembler *masm,
+ int total_args_passed,
+ int comp_args_on_stack,
+ const BasicType *sig_bt,
+ const VMRegPair *regs) {
+
+ // Note: r13 contains the senderSP on entry. We must preserve it since
+ // we may do an i2c -> c2i transition if we lose a race where compiled
+ // code goes non-entrant while we get args ready.
+
+ // In addition we use r13 to locate all the interpreter args because
+ // we must align the stack to 16 bytes.
+
+ // Adapters are frameless.
+
+ // An i2c adapter is frameless because the *caller* frame, which is
+ // interpreted, routinely repairs its own esp (from
+ // interpreter_frame_last_sp), even if a callee has modified the
+ // stack pointer. It also recalculates and aligns sp.
+
+ // A c2i adapter is frameless because the *callee* frame, which is
+ // interpreted, routinely repairs its caller's sp (from sender_sp,
+ // which is set up via the senderSP register).
+
+ // In other words, if *either* the caller or callee is interpreted, we can
+ // get the stack pointer repaired after a call.
+
+ // This is why c2i and i2c adapters cannot be indefinitely composed.
+ // In particular, if a c2i adapter were to somehow call an i2c adapter,
+ // both caller and callee would be compiled methods, and neither would
+ // clean up the stack pointer changes performed by the two adapters.
+ // If this happens, control eventually transfers back to the compiled
+ // caller, but with an uncorrected stack, causing delayed havoc.
+
+ if (VerifyAdapterCalls &&
+ (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) {
+#if 0
+ // So, let's test for cascading c2i/i2c adapters right now.
+ // assert(Interpreter::contains($return_addr) ||
+ // StubRoutines::contains($return_addr),
+ // "i2c adapter must return to an interpreter frame");
+ __ block_comment("verify_i2c { ");
+ Label L_ok;
+ if (Interpreter::code() != NULL)
+ range_check(masm, rax, r11,
+ Interpreter::code()->code_start(), Interpreter::code()->code_end(),
+ L_ok);
+ if (StubRoutines::code1() != NULL)
+ range_check(masm, rax, r11,
+ StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(),
+ L_ok);
+ if (StubRoutines::code2() != NULL)
+ range_check(masm, rax, r11,
+ StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(),
+ L_ok);
+ const char* msg = "i2c adapter must return to an interpreter frame";
+ __ block_comment(msg);
+ __ stop(msg);
+ __ bind(L_ok);
+ __ block_comment("} verify_i2ce ");
+#endif
+ }
+
+ // Cut-out for having no stack args.
+ int comp_words_on_stack = round_to(comp_args_on_stack*VMRegImpl::stack_slot_size, wordSize)>>LogBytesPerWord;
+ if (comp_args_on_stack) {
+ __ sub(rscratch1, sp, comp_words_on_stack * wordSize);
+ __ andr(sp, rscratch1, -16);
+ }
+
+ // Will jump to the compiled code just as if compiled code was doing it.
+ // Pre-load the register-jump target early, to schedule it better.
+ __ ldr(rscratch1, Address(rmethod, in_bytes(Method::from_compiled_offset())));
+
+ // Now generate the shuffle code.
+ for (int i = 0; i < total_args_passed; i++) {
+ if (sig_bt[i] == T_VOID) {
+ assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half");
+ continue;
+ }
+
+ // Pick up 0, 1 or 2 words from SP+offset.
+
+ assert(!regs[i].second()->is_valid() || regs[i].first()->next() == regs[i].second(),
+ "scrambled load targets?");
+ // Load in argument order going down.
+ int ld_off = (total_args_passed - i - 1)*Interpreter::stackElementSize;
+ // Point to interpreter value (vs. tag)
+ int next_off = ld_off - Interpreter::stackElementSize;
+ //
+ //
+ //
+ VMReg r_1 = regs[i].first();
+ VMReg r_2 = regs[i].second();
+ if (!r_1->is_valid()) {
+ assert(!r_2->is_valid(), "");
+ continue;
+ }
+ if (r_1->is_stack()) {
+ // Convert stack slot to an SP offset (+ wordSize to account for return address )
+ int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size;
+ if (!r_2->is_valid()) {
+ // sign extend???
+ __ ldrsw(rscratch2, Address(esp, ld_off));
+ __ str(rscratch2, Address(sp, st_off));
+ } else {
+ //
+ // We are using two optoregs. This can be either T_OBJECT,
+ // T_ADDRESS, T_LONG, or T_DOUBLE; the interpreter allocates
+ // two slots but only uses one for the T_LONG or T_DOUBLE case.
+ // So we must adjust where to pick up the data to match the
+ // interpreter.
+ //
+ // Interpreter local[n] == MSW, local[n+1] == LSW; however, locals
+ // are accessed with negative offsets so the LSW is at the lower address
+
+ // ld_off is MSW so get LSW
+ const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
+ next_off : ld_off;
+ __ ldr(rscratch2, Address(esp, offset));
+ // st_off is LSW (i.e. reg.first())
+ __ str(rscratch2, Address(sp, st_off));
+ }
+ } else if (r_1->is_Register()) { // Register argument
+ Register r = r_1->as_Register();
+ if (r_2->is_valid()) {
+ //
+ // We are using two VMRegs. This can be either T_OBJECT,
+ // T_ADDRESS, T_LONG, or T_DOUBLE; the interpreter allocates
+ // two slots but only uses one for the T_LONG or T_DOUBLE case.
+ // So we must adjust where to pick up the data to match the
+ // interpreter.
+
+ const int offset = (sig_bt[i]==T_LONG||sig_bt[i]==T_DOUBLE)?
+ next_off : ld_off;
+
+ // this can be a misaligned move
+ __ ldr(r, Address(esp, offset));
+ } else {
+ // sign extend and use a full word?
+ __ ldrw(r, Address(esp, ld_off));
+ }
+ } else {
+ if (!r_2->is_valid()) {
+ __ ldrs(r_1->as_FloatRegister(), Address(esp, ld_off));
+ } else {
+ __ ldrd(r_1->as_FloatRegister(), Address(esp, next_off));
+ }
+ }
+ }
+
+ // 6243940 We might end up in handle_wrong_method if
+ // the callee is deoptimized as we race thru here. If that
+ // happens we don't want to take a safepoint because the
+ // caller frame will look interpreted and arguments are now
+ // "compiled" so it is much better to make this transition
+ // invisible to the stack walking code. Unfortunately if
+ // we try and find the callee by normal means a safepoint
+ // is possible. So we stash the desired callee in the thread
+ // and the VM will find it there should this case occur.
+
+ __ str(rmethod, Address(rthread, JavaThread::callee_target_offset()));
+
+ __ br(rscratch1);
+}
+
+#ifdef BUILTIN_SIM
+static void generate_i2c_adapter_name(char *result, int total_args_passed, const BasicType *sig_bt)
+{
+ strcpy(result, "i2c(");
+ int idx = 4;
+ for (int i = 0; i < total_args_passed; i++) {
+ switch(sig_bt[i]) {
+ case T_BOOLEAN:
+ result[idx++] = 'Z';
+ break;
+ case T_CHAR:
+ result[idx++] = 'C';
+ break;
+ case T_FLOAT:
+ result[idx++] = 'F';
+ break;
+ case T_DOUBLE:
+ assert((i < (total_args_passed - 1)) && (sig_bt[i+1] == T_VOID),
+ "double must be followed by void");
+ i++;
+ result[idx++] = 'D';
+ break;
+ case T_BYTE:
+ result[idx++] = 'B';
+ break;
+ case T_SHORT:
+ result[idx++] = 'S';
+ break;
+ case T_INT:
+ result[idx++] = 'I';
+ break;
+ case T_LONG:
+ assert((i < (total_args_passed - 1)) && (sig_bt[i+1] == T_VOID),
+ "long must be followed by void");
+ i++;
+ result[idx++] = 'L';
+ break;
+ case T_OBJECT:
+ result[idx++] = 'O';
+ break;
+ case T_ARRAY:
+ result[idx++] = '[';
+ break;
+ case T_ADDRESS:
+ result[idx++] = 'P';
+ break;
+ case T_NARROWOOP:
+ result[idx++] = 'N';
+ break;
+ case T_METADATA:
+ result[idx++] = 'M';
+ break;
+ case T_NARROWKLASS:
+ result[idx++] = 'K';
+ break;
+ default:
+ result[idx++] = '?';
+ break;
+ }
+ }
+ result[idx++] = ')';
+ result[idx] = '\0';
+}
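+
+// For example, a static method taking (int, long, Object[]) would be named
+// "i2c(IL[)"; the T_VOID half that follows the long is folded into the 'L'
+// by the i++ above.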
+#endif
+
+// ---------------------------------------------------------------
+AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm,
+ int total_args_passed,
+ int comp_args_on_stack,
+ const BasicType *sig_bt,
+ const VMRegPair *regs,
+ AdapterFingerPrint* fingerprint) {
+ address i2c_entry = __ pc();
+#ifdef BUILTIN_SIM
+ char *name = NULL;
+ AArch64Simulator *sim = NULL;
+ size_t len = 65536;
+ if (NotifySimulator) {
+ name = NEW_C_HEAP_ARRAY(char, len, mtInternal);
+ }
+
+ if (name) {
+ generate_i2c_adapter_name(name, total_args_passed, sig_bt);
+ sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
+ sim->notifyCompile(name, i2c_entry);
+ }
+#endif
+ gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs);
+
+ address c2i_unverified_entry = __ pc();
+ Label skip_fixup;
+
+ Label ok;
+
+ Register holder = rscratch2;
+ Register receiver = j_rarg0;
+ Register tmp = r10; // A call-clobbered register not used for arg passing
+
+ // -------------------------------------------------------------------------
+ // Generate a C2I adapter. On entry we know rmethod holds the Method* during calls
+ // to the interpreter. The args start out packed in the compiled layout. They
+ // need to be unpacked into the interpreter layout. This will almost always
+ // require some stack space. We grow the current (compiled) stack, then repack
+ // the args. We finally end in a jump to the generic interpreter entry point.
+ // On exit from the interpreter, the interpreter will restore our SP (lest the
+ // compiled code, which relies solely on SP and not FP, get sick).
+
+ {
+ __ block_comment("c2i_unverified_entry {");
+ __ load_klass(rscratch1, receiver);
+ __ ldr(tmp, Address(holder, CompiledICHolder::holder_klass_offset()));
+ __ cmp(rscratch1, tmp);
+ __ ldr(rmethod, Address(holder, CompiledICHolder::holder_method_offset()));
+ __ br(Assembler::EQ, ok);
+ __ far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
+
+ __ bind(ok);
+ // Method might have been compiled since the call site was patched to
+ // interpreted; if that is the case treat it as a miss so we can get
+ // the call site corrected.
+ __ ldr(rscratch1, Address(rmethod, in_bytes(Method::code_offset())));
+ __ cbz(rscratch1, skip_fixup);
+ __ far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
+ __ block_comment("} c2i_unverified_entry");
+ }
+
+ address c2i_entry = __ pc();
+
+#ifdef BUILTIN_SIM
+ if (name) {
+ name[0] = 'c';
+ name[2] = 'i';
+ sim->notifyCompile(name, c2i_entry);
+ FREE_C_HEAP_ARRAY(char, name, mtInternal);
+ }
+#endif
+
+ gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup);
+
+ __ flush();
+ return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry);
+}
+
+int SharedRuntime::c_calling_convention(const BasicType *sig_bt,
+ VMRegPair *regs,
+ VMRegPair *regs2,
+ int total_args_passed) {
+ assert(regs2 == NULL, "not needed on AArch64");
+
+// We return the amount of VMRegImpl stack slots we need to reserve for all
+// the arguments NOT counting out_preserve_stack_slots.
+
+ static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = {
+ c_rarg0, c_rarg1, c_rarg2, c_rarg3, c_rarg4, c_rarg5, c_rarg6, c_rarg7
+ };
+ static const FloatRegister FP_ArgReg[Argument::n_float_register_parameters_c] = {
+ c_farg0, c_farg1, c_farg2, c_farg3,
+ c_farg4, c_farg5, c_farg6, c_farg7
+ };
+
+ uint int_args = 0;
+ uint fp_args = 0;
+ uint stk_args = 0; // inc by 2 each time
+
+ for (int i = 0; i < total_args_passed; i++) {
+ switch (sig_bt[i]) {
+ case T_BOOLEAN:
+ case T_CHAR:
+ case T_BYTE:
+ case T_SHORT:
+ case T_INT:
+ if (int_args < Argument::n_int_register_parameters_c) {
+ regs[i].set1(INT_ArgReg[int_args++]->as_VMReg());
+ } else {
+ regs[i].set1(VMRegImpl::stack2reg(stk_args));
+ stk_args += 2;
+ }
+ break;
+ case T_LONG:
+ assert(sig_bt[i + 1] == T_VOID, "expecting half");
+ // fall through
+ case T_OBJECT:
+ case T_ARRAY:
+ case T_ADDRESS:
+ case T_METADATA:
+ if (int_args < Argument::n_int_register_parameters_c) {
+ regs[i].set2(INT_ArgReg[int_args++]->as_VMReg());
+ } else {
+ regs[i].set2(VMRegImpl::stack2reg(stk_args));
+ stk_args += 2;
+ }
+ break;
+ case T_FLOAT:
+ if (fp_args < Argument::n_float_register_parameters_c) {
+ regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg());
+ } else {
+ regs[i].set1(VMRegImpl::stack2reg(stk_args));
+ stk_args += 2;
+ }
+ break;
+ case T_DOUBLE:
+ assert(sig_bt[i + 1] == T_VOID, "expecting half");
+ if (fp_args < Argument::n_float_register_parameters_c) {
+ regs[i].set2(FP_ArgReg[fp_args++]->as_VMReg());
+ } else {
+ regs[i].set2(VMRegImpl::stack2reg(stk_args));
+ stk_args += 2;
+ }
+ break;
+ case T_VOID: // Halves of longs and doubles
+ assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half");
+ regs[i].set_bad();
+ break;
+ default:
+ ShouldNotReachHere();
+ break;
+ }
+ }
+
+ return stk_args;
+}
+
+// On 64 bit we will store integer-like items to the stack as
+// 64 bit items (sparc abi) even though Java would only store
+// 32 bits for a parameter. On 32 bit it would simply be 32 bits,
+// so this routine will do 32->32 on 32 bit and 32->64 on 64 bit.
+static void move32_64(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
+ if (src.first()->is_stack()) {
+ if (dst.first()->is_stack()) {
+ // stack to stack
+ __ ldr(rscratch1, Address(rfp, reg2offset_in(src.first())));
+ __ str(rscratch1, Address(sp, reg2offset_out(dst.first())));
+ } else {
+ // stack to reg
+ __ ldrsw(dst.first()->as_Register(), Address(rfp, reg2offset_in(src.first())));
+ }
+ } else if (dst.first()->is_stack()) {
+ // reg to stack
+ // Do we really have to sign extend???
+ // __ movslq(src.first()->as_Register(), src.first()->as_Register());
+ __ str(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
+ } else {
+ if (dst.first() != src.first()) {
+ __ sxtw(dst.first()->as_Register(), src.first()->as_Register());
+ }
+ }
+}
+
+// An oop arg. Must pass a handle not the oop itself
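+// (the handle is just the stack address at which the oop has been spilled;
+// a NULL oop is passed through as a NULL handle via the csel below)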
+static void object_move(MacroAssembler* masm,
+ OopMap* map,
+ int oop_handle_offset,
+ int framesize_in_slots,
+ VMRegPair src,
+ VMRegPair dst,
+ bool is_receiver,
+ int* receiver_offset) {
+
+ // must pass a handle. First figure out the location we use as a handle
+
+ Register rHandle = dst.first()->is_stack() ? rscratch2 : dst.first()->as_Register();
+
+ // See if the oop is NULL; if it is we need no handle
+
+ if (src.first()->is_stack()) {
+
+ // Oop is already on the stack as an argument
+ int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots();
+ map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots));
+ if (is_receiver) {
+ *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size;
+ }
+
+ __ ldr(rscratch1, Address(rfp, reg2offset_in(src.first())));
+ __ lea(rHandle, Address(rfp, reg2offset_in(src.first())));
+ // conditionally move a NULL
+ __ cmp(rscratch1, zr);
+ __ csel(rHandle, zr, rHandle, Assembler::EQ);
+ } else {
+
+ // Oop is in a register; we must store it to the space we reserve
+ // on the stack for oop_handles and pass a handle if oop is non-NULL
+
+ const Register rOop = src.first()->as_Register();
+ int oop_slot;
+ if (rOop == j_rarg0)
+ oop_slot = 0;
+ else if (rOop == j_rarg1)
+ oop_slot = 1;
+ else if (rOop == j_rarg2)
+ oop_slot = 2;
+ else if (rOop == j_rarg3)
+ oop_slot = 3;
+ else if (rOop == j_rarg4)
+ oop_slot = 4;
+ else if (rOop == j_rarg5)
+ oop_slot = 5;
+ else if (rOop == j_rarg6)
+ oop_slot = 6;
+ else {
+ assert(rOop == j_rarg7, "wrong register");
+ oop_slot = 7;
+ }
+
+ oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset;
+ int offset = oop_slot*VMRegImpl::stack_slot_size;
+
+ map->set_oop(VMRegImpl::stack2reg(oop_slot));
+ // Store oop in handle area, may be NULL
+ __ str(rOop, Address(sp, offset));
+ if (is_receiver) {
+ *receiver_offset = offset;
+ }
+
+ __ cmp(rOop, zr);
+ __ lea(rHandle, Address(sp, offset));
+ // conditionally move a NULL
+ __ csel(rHandle, zr, rHandle, Assembler::EQ);
+ }
+
+ // If arg is on the stack then place it, otherwise it is already in the correct reg.
+ if (dst.first()->is_stack()) {
+ __ str(rHandle, Address(sp, reg2offset_out(dst.first())));
+ }
+}
+
+// A float arg may have to do float reg int reg conversion
+static void float_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
+ if (src.first() != dst.first()) {
+ if (src.is_single_phys_reg() && dst.is_single_phys_reg())
+ __ fmovs(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
+ else
+ ShouldNotReachHere();
+ }
+}
+
+// A long move
+static void long_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
+ if (src.first()->is_stack()) {
+ if (dst.first()->is_stack()) {
+ // stack to stack
+ __ ldr(rscratch1, Address(rfp, reg2offset_in(src.first())));
+ __ str(rscratch1, Address(sp, reg2offset_out(dst.first())));
+ } else {
+ // stack to reg
+ __ ldr(dst.first()->as_Register(), Address(rfp, reg2offset_in(src.first())));
+ }
+ } else if (dst.first()->is_stack()) {
+ // reg to stack
+ // Do we really have to sign extend???
+ // __ movslq(src.first()->as_Register(), src.first()->as_Register());
+ __ str(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first())));
+ } else {
+ if (dst.first() != src.first()) {
+ __ mov(dst.first()->as_Register(), src.first()->as_Register());
+ }
+ }
+}
+
+
+// A double move
+static void double_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) {
+ if (src.first() != dst.first()) {
+ if (src.is_single_phys_reg() && dst.is_single_phys_reg())
+ __ fmovd(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister());
+ else
+ ShouldNotReachHere();
+ }
+}
+
+
+void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
+ // We always ignore the frame_slots arg and just use the space just below frame pointer
+ // which by this time is free to use
+ switch (ret_type) {
+ case T_FLOAT:
+ __ strs(v0, Address(rfp, -wordSize));
+ break;
+ case T_DOUBLE:
+ __ strd(v0, Address(rfp, -wordSize));
+ break;
+ case T_VOID: break;
+ default: {
+ __ str(r0, Address(rfp, -wordSize));
+ }
+ }
+}
+
+void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) {
+ // We always ignore the frame_slots arg and just use the space just below frame pointer
+ // which by this time is free to use
+ switch (ret_type) {
+ case T_FLOAT:
+ __ ldrs(v0, Address(rfp, -wordSize));
+ break;
+ case T_DOUBLE:
+ __ ldrd(v0, Address(rfp, -wordSize));
+ break;
+ case T_VOID: break;
+ default: {
+ __ ldr(r0, Address(rfp, -wordSize));
+ }
+ }
+}
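+
+// Note that both helpers above stash the result one word below the saved
+// frame pointer (rfp - 8) rather than using the frame_slots argument.
+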
+static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
+ RegSet x;
+ for ( int i = first_arg ; i < arg_count ; i++ ) {
+ if (args[i].first()->is_Register()) {
+ x = x + args[i].first()->as_Register();
+ } else if (args[i].first()->is_FloatRegister()) {
+ __ strd(args[i].first()->as_FloatRegister(), Address(__ pre(sp, -2 * wordSize)));
+ }
+ }
+ __ push(x, sp);
+}
+
+static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) {
+ RegSet x;
+ for ( int i = first_arg ; i < arg_count ; i++ ) {
+ if (args[i].first()->is_Register()) {
+ x = x + args[i].first()->as_Register();
+ } else {
+ ;
+ }
+ }
+ __ pop(x, sp);
+ for ( int i = first_arg ; i < arg_count ; i++ ) {
+ if (args[i].first()->is_Register()) {
+ ;
+ } else if (args[i].first()->is_FloatRegister()) {
+ __ ldrd(args[i].first()->as_FloatRegister(), Address(__ post(sp, 2 * wordSize)));
+ }
+ }
+}
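+
+// A note on the two helpers above: integer args travel as a single RegSet
+// push/pop, while each FP arg is spilled to its own 16-byte pre/post-indexed
+// slot, which should keep sp 16-byte aligned throughout.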
+
+
+// Check GC_locker::needs_gc and enter the runtime if it's true. This
+// keeps a new JNI critical region from starting until a GC has been
+// forced. Save down any oops in registers and describe them in an
+// OopMap.
+static void check_needs_gc_for_critical_native(MacroAssembler* masm,
+ int stack_slots,
+ int total_c_args,
+ int total_in_args,
+ int arg_save_area,
+ OopMapSet* oop_maps,
+ VMRegPair* in_regs,
+ BasicType* in_sig_bt) { Unimplemented(); }
+
+// Unpack an array argument into a pointer to the body and the length
+// if the array is non-null, otherwise pass 0 for both.
+static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) { Unimplemented(); }
+
+
+class ComputeMoveOrder: public StackObj {
+ class MoveOperation: public ResourceObj {
+ friend class ComputeMoveOrder;
+ private:
+ VMRegPair _src;
+ VMRegPair _dst;
+ int _src_index;
+ int _dst_index;
+ bool _processed;
+ MoveOperation* _next;
+ MoveOperation* _prev;
+
+ static int get_id(VMRegPair r) { Unimplemented(); return 0; }
+
+ public:
+ MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst):
+ _src(src)
+ , _src_index(src_index)
+ , _dst(dst)
+ , _dst_index(dst_index)
+ , _next(NULL)
+ , _prev(NULL)
+ , _processed(false) { Unimplemented(); }
+
+ VMRegPair src() const { Unimplemented(); return _src; }
+ int src_id() const { Unimplemented(); return 0; }
+ int src_index() const { Unimplemented(); return 0; }
+ VMRegPair dst() const { Unimplemented(); return _src; }
+ void set_dst(int i, VMRegPair dst) { Unimplemented(); }
+ int dst_index() const { Unimplemented(); return 0; }
+ int dst_id() const { Unimplemented(); return 0; }
+ MoveOperation* next() const { Unimplemented(); return 0; }
+ MoveOperation* prev() const { Unimplemented(); return 0; }
+ void set_processed() { Unimplemented(); }
+ bool is_processed() const { Unimplemented(); return 0; }
+
+ // insert
+ void break_cycle(VMRegPair temp_register) { Unimplemented(); }
+
+ void link(GrowableArray<MoveOperation*>& killer) { Unimplemented(); }
+ };
+
+ private:
+ GrowableArray<MoveOperation*> edges;
+
+ public:
+ ComputeMoveOrder(int total_in_args, VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs,
+ BasicType* in_sig_bt, GrowableArray<int>& arg_order, VMRegPair tmp_vmreg) { Unimplemented(); }
+
+ // Collect all the move operations
+ void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) { Unimplemented(); }
+
+ // Walk the edges breaking cycles between moves. The result list
+ // can be walked in order to produce the proper set of loads
+ GrowableArray<MoveOperation*>* get_store_order(VMRegPair temp_register) { Unimplemented(); return 0; }
+};
+
+
+static void rt_call(MacroAssembler* masm, address dest, int gpargs, int fpargs, int type) {
+ CodeBlob *cb = CodeCache::find_blob(dest);
+ if (cb) {
+ __ far_call(RuntimeAddress(dest));
+ } else {
+ assert((unsigned)gpargs < 256, "eek!");
+ assert((unsigned)fpargs < 32, "eek!");
+ __ lea(rscratch1, RuntimeAddress(dest));
+ __ mov(rscratch2, (gpargs << 6) | (fpargs << 2) | type);
+ __ blrt(rscratch1, rscratch2);
+ __ maybe_isb();
+ }
+}
+
+static void verify_oop_args(MacroAssembler* masm,
+ methodHandle method,
+ const BasicType* sig_bt,
+ const VMRegPair* regs) {
+ Register temp_reg = r19; // not part of any compiled calling seq
+ if (VerifyOops) {
+ for (int i = 0; i < method->size_of_parameters(); i++) {
+ if (sig_bt[i] == T_OBJECT ||
+ sig_bt[i] == T_ARRAY) {
+ VMReg r = regs[i].first();
+ assert(r->is_valid(), "bad oop arg");
+ if (r->is_stack()) {
+ __ ldr(temp_reg, Address(sp, r->reg2stack() * VMRegImpl::stack_slot_size));
+ __ verify_oop(temp_reg);
+ } else {
+ __ verify_oop(r->as_Register());
+ }
+ }
+ }
+ }
+}
+
+static void gen_special_dispatch(MacroAssembler* masm,
+ methodHandle method,
+ const BasicType* sig_bt,
+ const VMRegPair* regs) {
+ verify_oop_args(masm, method, sig_bt, regs);
+ vmIntrinsics::ID iid = method->intrinsic_id();
+
+ // Now write the args into the outgoing interpreter space
+ bool has_receiver = false;
+ Register receiver_reg = noreg;
+ int member_arg_pos = -1;
+ Register member_reg = noreg;
+ int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid);
+ if (ref_kind != 0) {
+ member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument
+ member_reg = r19; // known to be free at this point
+ has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind);
+ } else if (iid == vmIntrinsics::_invokeBasic) {
+ has_receiver = true;
+ } else {
+ fatal(err_msg_res("unexpected intrinsic id %d", iid));
+ }
+
+ if (member_reg != noreg) {
+ // Load the member_arg into register, if necessary.
+ SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs);
+ VMReg r = regs[member_arg_pos].first();
+ if (r->is_stack()) {
+ __ ldr(member_reg, Address(sp, r->reg2stack() * VMRegImpl::stack_slot_size));
+ } else {
+ // no data motion is needed
+ member_reg = r->as_Register();
+ }
+ }
+
+ if (has_receiver) {
+ // Make sure the receiver is loaded into a register.
+ assert(method->size_of_parameters() > 0, "oob");
+ assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object");
+ VMReg r = regs[0].first();
+ assert(r->is_valid(), "bad receiver arg");
+ if (r->is_stack()) {
+ // Porting note: This assumes that compiled calling conventions always
+ // pass the receiver oop in a register. If this is not true on some
+ // platform, pick a temp and load the receiver from stack.
+ fatal("receiver always in a register");
+ receiver_reg = r2; // known to be free at this point
+ __ ldr(receiver_reg, Address(sp, r->reg2stack() * VMRegImpl::stack_slot_size));
+ } else {
+ // no data motion is needed
+ receiver_reg = r->as_Register();
+ }
+ }
+
+ // Figure out which address we are really jumping to:
+ MethodHandles::generate_method_handle_dispatch(masm, iid,
+ receiver_reg, member_reg, /*for_compiler_entry:*/ true);
+}
+
+// ---------------------------------------------------------------------------
+// Generate a native wrapper for a given method. The method takes arguments
+// in the Java compiled code convention, marshals them to the native
+// convention (handlizes oops, etc), transitions to native, makes the call,
+// returns to java state (possibly blocking), unhandlizes any result and
+// returns.
+//
+// Critical native functions are a shorthand for the use of
+// GetPrimitiveArrayCritical and disallow the use of any other JNI
+// functions. The wrapper is expected to unpack the arguments before
+// passing them to the callee and perform checks before and after the
+// native call to ensure that the GC_locker
+// lock_critical/unlock_critical semantics are followed. Some other
+// parts of JNI setup are skipped, like the tear down of the JNI handle
+// block and the check for pending exceptions, because it's impossible for them
+// to be thrown.
+//
+// They are roughly structured like this:
+// if (GC_locker::needs_gc())
+// SharedRuntime::block_for_jni_critical();
+// transition to thread_in_native
+// unpack array arguments and call native entry point
+// check for safepoint in progress
+// check if any thread suspend flags are set
+// call into JVM and possibly unlock the JNI critical
+// if a GC was suppressed while in the critical native.
+// transition back to thread_in_Java
+// return to caller
+//
+nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm,
+ methodHandle method,
+ int compile_id,
+ BasicType* in_sig_bt,
+ VMRegPair* in_regs,
+ BasicType ret_type) {
+#ifdef BUILTIN_SIM
+ if (NotifySimulator) {
+ // Names are up to 65536 chars long. UTF8-coded strings are up to
+ // 3 bytes per character. We concatenate three such strings.
+ // Yes, I know this is ridiculous, but it's debug code and glibc
+ // allocates large arrays very efficiently.
+ size_t len = (65536 * 3) * 3;
+ char *name = new char[len];
+
+ strncpy(name, method()->method_holder()->name()->as_utf8(), len);
+ strncat(name, ".", len);
+ strncat(name, method()->name()->as_utf8(), len);
+ strncat(name, method()->signature()->as_utf8(), len);
+ AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck)->notifyCompile(name, __ pc());
+ delete[] name;
+ }
+#endif
+
+ if (method->is_method_handle_intrinsic()) {
+ vmIntrinsics::ID iid = method->intrinsic_id();
+ intptr_t start = (intptr_t)__ pc();
+ int vep_offset = ((intptr_t)__ pc()) - start;
+
+ // First instruction must be a nop as it may need to be patched on deoptimisation
+ __ nop();
+ gen_special_dispatch(masm,
+ method,
+ in_sig_bt,
+ in_regs);
+ int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period
+ __ flush();
+ int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually
+ return nmethod::new_native_nmethod(method,
+ compile_id,
+ masm->code(),
+ vep_offset,
+ frame_complete,
+ stack_slots / VMRegImpl::slots_per_word,
+ in_ByteSize(-1),
+ in_ByteSize(-1),
+ (OopMapSet*)NULL);
+ }
+ bool is_critical_native = true;
+ address native_func = method->critical_native_function();
+ if (native_func == NULL) {
+ native_func = method->native_function();
+ is_critical_native = false;
+ }
+ assert(native_func != NULL, "must have function");
+
+ // An OopMap for lock (and class if static)
+ OopMapSet *oop_maps = new OopMapSet();
+ intptr_t start = (intptr_t)__ pc();
+
+ // We have received a description of where all the java args are located
+ // on entry to the wrapper. We need to convert these args to where
+ // the jni function will expect them. To figure out where they go
+ // we convert the java signature to a C signature by inserting
+ // the hidden arguments as arg[0] and possibly arg[1] (static method)
+
+ const int total_in_args = method->size_of_parameters();
+ int total_c_args = total_in_args;
+ if (!is_critical_native) {
+ total_c_args += 1;
+ if (method->is_static()) {
+ total_c_args++;
+ }
+ } else {
+ for (int i = 0; i < total_in_args; i++) {
+ if (in_sig_bt[i] == T_ARRAY) {
+ total_c_args++;
+ }
+ }
+ }
+
+ BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args);
+ VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args);
+ BasicType* in_elem_bt = NULL;
+
+ int argc = 0;
+ if (!is_critical_native) {
+ out_sig_bt[argc++] = T_ADDRESS;
+ if (method->is_static()) {
+ out_sig_bt[argc++] = T_OBJECT;
+ }
+
+ for (int i = 0; i < total_in_args ; i++ ) {
+ out_sig_bt[argc++] = in_sig_bt[i];
+ }
+ } else {
+ Thread* THREAD = Thread::current();
+ in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args);
+ SignatureStream ss(method->signature());
+ for (int i = 0; i < total_in_args ; i++ ) {
+ if (in_sig_bt[i] == T_ARRAY) {
+ // Arrays are passed as int, elem* pair
+ out_sig_bt[argc++] = T_INT;
+ out_sig_bt[argc++] = T_ADDRESS;
+ Symbol* atype = ss.as_symbol(CHECK_NULL);
+ const char* at = atype->as_C_string();
+ if (strlen(at) == 2) {
+ assert(at[0] == '[', "must be");
+ switch (at[1]) {
+ case 'B': in_elem_bt[i] = T_BYTE; break;
+ case 'C': in_elem_bt[i] = T_CHAR; break;
+ case 'D': in_elem_bt[i] = T_DOUBLE; break;
+ case 'F': in_elem_bt[i] = T_FLOAT; break;
+ case 'I': in_elem_bt[i] = T_INT; break;
+ case 'J': in_elem_bt[i] = T_LONG; break;
+ case 'S': in_elem_bt[i] = T_SHORT; break;
+ case 'Z': in_elem_bt[i] = T_BOOLEAN; break;
+ default: ShouldNotReachHere();
+ }
+ }
+ } else {
+ out_sig_bt[argc++] = in_sig_bt[i];
+ in_elem_bt[i] = T_VOID;
+ }
+ if (in_sig_bt[i] != T_VOID) {
+ assert(in_sig_bt[i] == ss.type(), "must match");
+ ss.next();
+ }
+ }
+ }
+
+ // Now figure out where the args must be stored and how much stack space
+ // they require.
+ int out_arg_slots;
+ out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args);
+
+ // Compute framesize for the wrapper. We need to handlize all oops in
+ // incoming registers
+
+ // Calculate the total number of stack slots we will need.
+
+ // First count the abi requirement plus all of the outgoing args
+ int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots;
+
+ // Now the space for the inbound oop handle area
+ int total_save_slots = 8 * VMRegImpl::slots_per_word; // 8 arguments passed in registers
+ if (is_critical_native) {
+ // Critical natives may have to call out so they need a save area
+ // for register arguments.
+ int double_slots = 0;
+ int single_slots = 0;
+ for ( int i = 0; i < total_in_args; i++) {
+ if (in_regs[i].first()->is_Register()) {
+ const Register reg = in_regs[i].first()->as_Register();
+ switch (in_sig_bt[i]) {
+ case T_BOOLEAN:
+ case T_BYTE:
+ case T_SHORT:
+ case T_CHAR:
+ case T_INT: single_slots++; break;
+ case T_ARRAY: // specific to LP64 (7145024)
+ case T_LONG: double_slots++; break;
+ default: ShouldNotReachHere();
+ }
+ } else if (in_regs[i].first()->is_FloatRegister()) {
+ ShouldNotReachHere();
+ }
+ }
+ total_save_slots = double_slots * 2 + single_slots;
+ // align the save area
+ if (double_slots != 0) {
+ stack_slots = round_to(stack_slots, 2);
+ }
+ }
+
+ int oop_handle_offset = stack_slots;
+ stack_slots += total_save_slots;
+
+ // Now any space we need for handlizing a klass if this is a static method
+
+ int klass_slot_offset = 0;
+ int klass_offset = -1;
+ int lock_slot_offset = 0;
+ bool is_static = false;
+
+ if (method->is_static()) {
+ klass_slot_offset = stack_slots;
+ stack_slots += VMRegImpl::slots_per_word;
+ klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size;
+ is_static = true;
+ }
+
+ // Plus a lock if needed
+
+ if (method->is_synchronized()) {
+ lock_slot_offset = stack_slots;
+ stack_slots += VMRegImpl::slots_per_word;
+ }
+
+ // Now a place (+2) to save return values or temp during shuffling
+ // + 4 for return address (which we own) and saved rfp
+ stack_slots += 6;
+
+ // OK, the space we have allocated will look like:
+ //
+ //
+ // FP-> | |
+ // |---------------------|
+ // | 2 slots for moves |
+ // |---------------------|
+ // | lock box (if sync) |
+ // |---------------------| <- lock_slot_offset
+ // | klass (if static) |
+ // |---------------------| <- klass_slot_offset
+ // | oopHandle area |
+ // |---------------------| <- oop_handle_offset (8 java arg registers)
+ // | outbound memory |
+ // | based arguments |
+ // | |
+ // |---------------------|
+ // | |
+ // SP-> | out_preserved_slots |
+ //
+ //
+
+
+ // Now compute the actual number of stack words we need, rounding to make
+ // the stack properly aligned.
+ stack_slots = round_to(stack_slots, StackAlignmentInSlots);
+
+ int stack_size = stack_slots * VMRegImpl::stack_slot_size;
+
+ // First thing: make an IC check to see if we should even be here
+
+ // We are free to use all registers as temps without saving them and
+ // restoring them except rfp. rfp is the only callee save register
+ // as far as the interpreter and the compiler(s) are concerned.
+
+
+ const Register ic_reg = rscratch2;
+ const Register receiver = j_rarg0;
+
+ Label hit;
+ Label exception_pending;
+
+ assert_different_registers(ic_reg, receiver, rscratch1);
+ __ verify_oop(receiver);
+ __ cmp_klass(receiver, ic_reg, rscratch1);
+ __ br(Assembler::EQ, hit);
+
+ __ far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));
+
+ // Verified entry point must be aligned
+ __ align(8);
+
+ __ bind(hit);
+
+ int vep_offset = ((intptr_t)__ pc()) - start;
+
+ // Generate stack overflow check
+
+ // If we have to make this method not-entrant we'll overwrite its
+ // first instruction with a jump. For this action to be legal we
+ // must ensure that this first instruction is a B, BL, NOP, BKPT,
+ // SVC, HVC, or SMC. Make it a NOP.
+ __ nop();
+
+ if (UseStackBanging) {
+ __ bang_stack_with_offset(StackShadowPages*os::vm_page_size());
+ } else {
+ Unimplemented();
+ }
+
+ // Generate a new frame for the wrapper.
+ __ enter();
+ // -2 because return address is already present and so is saved rfp
+ __ sub(sp, sp, stack_size - 2*wordSize);
+
+ // Frame is now completed as far as size and linkage.
+ int frame_complete = ((intptr_t)__ pc()) - start;
+
+ // record entry into native wrapper code
+ if (NotifySimulator) {
+ __ notify(Assembler::method_entry);
+ }
+
+ // We use r20 as the oop handle for the receiver/klass
+ // It is callee save so it survives the call to native
+
+ const Register oop_handle_reg = r20;
+
+ if (is_critical_native) {
+ check_needs_gc_for_critical_native(masm, stack_slots, total_c_args, total_in_args,
+ oop_handle_offset, oop_maps, in_regs, in_sig_bt);
+ }
+
+ //
+ // We immediately shuffle the arguments so that any vm call we have to
+ // make from here on out (sync slow path, jvmti, etc.) we will have
+ // captured the oops from our caller and have a valid oopMap for
+ // them.
+
+ // -----------------
+ // The Grand Shuffle
+
+ // The Java calling convention is either equal (linux) or denser (win64) than the
+ // c calling convention. However, because of the jni_env argument the c calling
+ // convention always has at least one more (and two for static) arguments than Java.
+ // Therefore if we move the args from java -> c backwards then we will never have
+ // a register->register conflict and we don't have to build a dependency graph
+ // and figure out how to break any cycles.
+ //
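+ // Illustrative example (hypothetical non-static native (II)V): the incoming
+ // java args are (receiver, a, b) and the outgoing c args are
+ // (JNIEnv*, receiver handle, a, b), so java arg i maps to c arg i + 1 and
+ // the loop below emits the moves for b, then a, then the receiver.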
+
+ // Record esp-based slot for receiver on stack for non-static methods
+ int receiver_offset = -1;
+
+ // This is a trick. We double the stack slots so we can claim
+ // the oops in the caller's frame. Since we are sure to have
+ // more args than the caller, doubling is enough to make
+ // sure we can capture all the incoming oop args from the
+ // caller.
+ //
+ OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/);
+
+ // Mark location of rfp (someday)
+ // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rfp));
+
+
+ int float_args = 0;
+ int int_args = 0;
+
+#ifdef ASSERT
+ bool reg_destroyed[RegisterImpl::number_of_registers];
+ bool freg_destroyed[FloatRegisterImpl::number_of_registers];
+ for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) {
+ reg_destroyed[r] = false;
+ }
+ for ( int f = 0 ; f < FloatRegisterImpl::number_of_registers ; f++ ) {
+ freg_destroyed[f] = false;
+ }
+
+#endif /* ASSERT */
+
+ // This may iterate in two different directions depending on the
+ // kind of native it is. The reason is that for regular JNI natives
+ // the incoming and outgoing registers are offset upwards and for
+ // critical natives they are offset down.
+ GrowableArray<int> arg_order(2 * total_in_args);
+ VMRegPair tmp_vmreg;
+ tmp_vmreg.set1(r19->as_VMReg());
+
+ if (!is_critical_native) {
+ for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) {
+ arg_order.push(i);
+ arg_order.push(c_arg);
+ }
+ } else {
+ // Compute a valid move order, using tmp_vmreg to break any cycles
+ ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg);
+ }
+
+ int temploc = -1;
+ for (int ai = 0; ai < arg_order.length(); ai += 2) {
+ int i = arg_order.at(ai);
+ int c_arg = arg_order.at(ai + 1);
+ __ block_comment(err_msg("move %d -> %d", i, c_arg));
+ if (c_arg == -1) {
+ assert(is_critical_native, "should only be required for critical natives");
+ // This arg needs to be moved to a temporary
+ __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register());
+ in_regs[i] = tmp_vmreg;
+ temploc = i;
+ continue;
+ } else if (i == -1) {
+ assert(is_critical_native, "should only be required for critical natives");
+ // Read from the temporary location
+ assert(temploc != -1, "must be valid");
+ i = temploc;
+ temploc = -1;
+ }
+#ifdef ASSERT
+ if (in_regs[i].first()->is_Register()) {
+ assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!");
+ } else if (in_regs[i].first()->is_FloatRegister()) {
+ assert(!freg_destroyed[in_regs[i].first()->as_FloatRegister()->encoding()], "destroyed reg!");
+ }
+ if (out_regs[c_arg].first()->is_Register()) {
+ reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
+ } else if (out_regs[c_arg].first()->is_FloatRegister()) {
+ freg_destroyed[out_regs[c_arg].first()->as_FloatRegister()->encoding()] = true;
+ }
+#endif /* ASSERT */
+ switch (in_sig_bt[i]) {
+ case T_ARRAY:
+ if (is_critical_native) {
+ unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]);
+ c_arg++;
+#ifdef ASSERT
+ if (out_regs[c_arg].first()->is_Register()) {
+ reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true;
+ } else if (out_regs[c_arg].first()->is_FloatRegister()) {
+ freg_destroyed[out_regs[c_arg].first()->as_FloatRegister()->encoding()] = true;
+ }
+#endif
+ int_args++;
+ break;
+ }
+ case T_OBJECT:
+ assert(!is_critical_native, "no oop arguments");
+ object_move(masm, map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg],
+ ((i == 0) && (!is_static)),
+ &receiver_offset);
+ int_args++;
+ break;
+ case T_VOID:
+ break;
+
+ case T_FLOAT:
+ float_move(masm, in_regs[i], out_regs[c_arg]);
+ float_args++;
+ break;
+
+ case T_DOUBLE:
+ assert( i + 1 < total_in_args &&
+ in_sig_bt[i + 1] == T_VOID &&
+ out_sig_bt[c_arg+1] == T_VOID, "bad arg list");
+ double_move(masm, in_regs[i], out_regs[c_arg]);
+ float_args++;
+ break;
+
+ case T_LONG :
+ long_move(masm, in_regs[i], out_regs[c_arg]);
+ int_args++;
+ break;
+
+ case T_ADDRESS: assert(false, "found T_ADDRESS in java args");
+
+ default:
+ move32_64(masm, in_regs[i], out_regs[c_arg]);
+ int_args++;
+ }
+ }
+
+ // point c_arg at the first arg that is already loaded in case we
+ // need to spill before we call out
+ int c_arg = total_c_args - total_in_args;
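+ // e.g. (illustrative): for a regular non-static native total_c_args ==
+ // total_in_args + 1 (the JNIEnv* slot), so c_arg == 1 here; for a static
+ // native it starts at 2 and is decremented below once the klass handle has
+ // been placed in c_rarg1.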
+
+ // Pre-load a static method's oop into r20. Used both by locking code and
+ // the normal JNI call code.
+ if (method->is_static() && !is_critical_native) {
+
+ // load oop into a register
+ __ movoop(oop_handle_reg,
+ JNIHandles::make_local(method->method_holder()->java_mirror()),
+ /*immediate*/true);
+
+ // Now handlize the static class mirror; it's known not-null.
+ __ str(oop_handle_reg, Address(sp, klass_offset));
+ map->set_oop(VMRegImpl::stack2reg(klass_slot_offset));
+
+ // Now get the handle
+ __ lea(oop_handle_reg, Address(sp, klass_offset));
+ // store the klass handle as second argument
+ __ mov(c_rarg1, oop_handle_reg);
+ // and protect the arg if we must spill
+ c_arg--;
+ }
+
+ // Change state to native (we save the return address in the thread, since it might not
+ // be pushed on the stack when we do a stack traversal). It is enough that the pc()
+ // points into the right code segment. It does not have to be the correct return pc.
+ // We use the same pc/oopMap repeatedly when we call out
+
+ intptr_t the_pc = (intptr_t) __ pc();
+ oop_maps->add_gc_map(the_pc - start, map);
+
+ __ set_last_Java_frame(sp, noreg, (address)the_pc, rscratch1);
+
+
+ // We have all of the arguments set up at this point. We must not touch any
+ // of the argument registers from here on (if we had to save and restore them
+ // there would be no oop map describing the oops they might hold).
+
+ {
+ SkipIfEqual skip(masm, &DTraceMethodProbes, false);
+ // protect the args we've loaded
+ save_args(masm, total_c_args, c_arg, out_regs);
+ __ mov_metadata(c_rarg1, method());
+ __ call_VM_leaf(
+ CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry),
+ rthread, c_rarg1);
+ restore_args(masm, total_c_args, c_arg, out_regs);
+ }
+
+ // RedefineClasses() tracing support for obsolete method entry
+ if (RC_TRACE_IN_RANGE(0x00001000, 0x00002000)) {
+ // protect the args we've loaded
+ save_args(masm, total_c_args, c_arg, out_regs);
+ __ mov_metadata(c_rarg1, method());
+ __ call_VM_leaf(
+ CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry),
+ rthread, c_rarg1);
+ restore_args(masm, total_c_args, c_arg, out_regs);
+ }
+
+ // Lock a synchronized method
+
+ // Register definitions used by locking and unlocking
+
+ const Register swap_reg = r0;
+ const Register obj_reg = r19; // Will contain the oop
+ const Register lock_reg = r13; // Address of compiler lock object (BasicLock)
+ const Register old_hdr = r13; // value of old header at unlock time
+
+ Label slow_path_lock;
+ Label lock_done;
+
+ if (method->is_synchronized()) {
+ assert(!is_critical_native, "unhandled");
+
+
+ const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes();
+
+ // Get the handle (the 2nd argument)
+ __ mov(oop_handle_reg, c_rarg1);
+
+ // Get address of the box
+
+ __ lea(lock_reg, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
+
+ // Load the oop from the handle
+ __ ldr(obj_reg, Address(oop_handle_reg, 0));
+
+ if (UseBiasedLocking) {
+ __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, lock_done, &slow_path_lock);
+ }
+
+ // Load (object->mark() | 1) into swap_reg %r0
+ __ ldr(rscratch1, Address(obj_reg, 0));
+ __ orr(swap_reg, rscratch1, 1);
+
+ // Save (object->mark() | 1) into BasicLock's displaced header
+ __ str(swap_reg, Address(lock_reg, mark_word_offset));
+
+ // src -> dest iff dest == r0 else r0 <- dest
+ { Label here;
+ __ cmpxchgptr(r0, lock_reg, obj_reg, rscratch1, lock_done, /*fallthrough*/NULL);
+ }
+
+ // Hmm should this move to the slow path code area???
+
+ // Test if the oopMark is an obvious stack pointer, i.e.,
+ // 1) (mark & 3) == 0, and
+ // 2) sp <= mark < mark + os::pagesize()
+ // These two tests can be done by evaluating the following
+ // expression: ((mark - sp) & (3 - os::vm_page_size())),
+ // assuming both stack pointer and pagesize have their
+ // least significant 2 bits clear.
+ // NOTE: the oopMark is in swap_reg %r0 as the result of cmpxchg
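+ // Worked example (illustrative numbers, assuming a 4K page):
+ // 3 - os::vm_page_size() == -4093 == 0x...fffff003, so the ands below is
+ // zero only when (mark - sp) is non-negative, less than the page size and
+ // has its low two bits clear -- i.e. the recursive stack-lock case. Any
+ // other mark value leaves bits set and we take the slow path.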
+
+ __ sub(swap_reg, sp, swap_reg);
+ __ neg(swap_reg, swap_reg);
+ __ ands(swap_reg, swap_reg, 3 - os::vm_page_size());
+
+ // Save the test result, for recursive case, the result is zero
+ __ str(swap_reg, Address(lock_reg, mark_word_offset));
+ __ br(Assembler::NE, slow_path_lock);
+
+ // Slow path will re-enter here
+
+ __ bind(lock_done);
+ }
+
+
+ // Finally just about ready to make the JNI call
+
+
+ // get JNIEnv* which is first argument to native
+ if (!is_critical_native) {
+ __ lea(c_rarg0, Address(rthread, in_bytes(JavaThread::jni_environment_offset())));
+ }
+
+ // Now set thread in native
+ __ mov(rscratch1, _thread_in_native);
+ __ str(rscratch1, Address(rthread, JavaThread::thread_state_offset()));
+
+ {
+ int return_type = 0;
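+ // return_type encodes the result kind for rt_call/blrt below:
+ // 0 == void, 1 == integral (or reference), 2 == float, 3 == double --
+ // apparently the same encoding as the MacroAssembler::ret_type_* values
+ // passed to blrt elsewhere in this file.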
+ switch (ret_type) {
+ case T_VOID:
+ return_type = 0; break;
+ case T_CHAR:
+ case T_BYTE:
+ case T_SHORT:
+ case T_INT:
+ case T_BOOLEAN:
+ case T_LONG:
+ return_type = 1; break;
+ case T_ARRAY:
+ case T_OBJECT:
+ return_type = 1; break;
+ case T_FLOAT:
+ return_type = 2; break;
+ case T_DOUBLE:
+ return_type = 3; break;
+ default:
+ ShouldNotReachHere();
+ }
+ rt_call(masm, native_func,
+ int_args + 2, // AArch64 passes up to 8 args in int registers
+ float_args, // and up to 8 float args
+ return_type);
+ }
+
+ // Unpack native results.
+ switch (ret_type) {
+ case T_BOOLEAN: __ ubfx(r0, r0, 0, 8); break;
+ case T_CHAR : __ ubfx(r0, r0, 0, 16); break;
+ case T_BYTE : __ sbfx(r0, r0, 0, 8); break;
+ case T_SHORT : __ sbfx(r0, r0, 0, 16); break;
+ case T_INT : __ sbfx(r0, r0, 0, 32); break;
+ case T_DOUBLE :
+ case T_FLOAT :
+ // Result is in v0 we'll save as needed
+ break;
+ case T_ARRAY: // Really a handle
+ case T_OBJECT: // Really a handle
+ break; // can't de-handlize until after safepoint check
+ case T_VOID: break;
+ case T_LONG: break;
+ default : ShouldNotReachHere();
+ }
+
+ // Switch thread to "native transition" state before reading the synchronization state.
+ // This additional state is necessary because reading and testing the synchronization
+ // state is not atomic w.r.t. GC, as this scenario demonstrates:
+ // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted.
+ // VM thread changes sync state to synchronizing and suspends threads for GC.
+ // Thread A is resumed to finish this native method, but doesn't block here since it
+ // didn't see any synchronization in progress, and escapes.
+ __ mov(rscratch1, _thread_in_native_trans);
+ __ str(rscratch1, Address(rthread, JavaThread::thread_state_offset()));
+
+ if (os::is_MP()) {
+ if (UseMembar) {
+ // Force this write out before the read below
+ __ dmb(Assembler::SY);
+ } else {
+ // Write serialization page so VM thread can do a pseudo remote membar.
+ // We use the current thread pointer to calculate a thread specific
+ // offset to write to within the page. This minimizes bus traffic
+ // due to cache line collision.
+ __ serialize_memory(rthread, r2);
+ }
+ }
+
+ Label after_transition;
+
+ // check for safepoint operation in progress and/or pending suspend requests
+ {
+ Label Continue;
+
+ { unsigned long offset;
+ __ adrp(rscratch1,
+ ExternalAddress((address)SafepointSynchronize::address_of_state()),
+ offset);
+ __ ldrw(rscratch1, Address(rscratch1, offset));
+ }
+ __ cmpw(rscratch1, SafepointSynchronize::_not_synchronized);
+
+ Label L;
+ __ br(Assembler::NE, L);
+ __ ldrw(rscratch1, Address(rthread, JavaThread::suspend_flags_offset()));
+ __ cbz(rscratch1, Continue);
+ __ bind(L);
+
+ // Don't use call_VM as it will see a possible pending exception and forward it
+ // and never return here preventing us from clearing _last_native_pc down below.
+ //
+ save_native_result(masm, ret_type, stack_slots);
+ __ mov(c_rarg0, rthread);
+#ifndef PRODUCT
+ assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
+#endif
+ if (!is_critical_native) {
+ __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)));
+ } else {
+ __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition)));
+ }
+ __ blrt(rscratch1, 1, 0, 1);
+ __ maybe_isb();
+ // Restore any method result value
+ restore_native_result(masm, ret_type, stack_slots);
+
+ if (is_critical_native) {
+ // The call above performed the transition to thread_in_Java so
+ // skip the transition logic below.
+ __ b(after_transition);
+ }
+
+ __ bind(Continue);
+ }
+
+ // change thread state
+ __ mov(rscratch1, _thread_in_Java);
+ __ str(rscratch1, Address(rthread, JavaThread::thread_state_offset()));
+ __ bind(after_transition);
+
+ Label reguard;
+ Label reguard_done;
+ __ ldrb(rscratch1, Address(rthread, JavaThread::stack_guard_state_offset()));
+ __ cmpw(rscratch1, JavaThread::stack_guard_yellow_disabled);
+ __ br(Assembler::EQ, reguard);
+ __ bind(reguard_done);
+
+ // native result if any is live
+
+ // Unlock
+ Label unlock_done;
+ Label slow_path_unlock;
+ if (method->is_synchronized()) {
+
+ // Get locked oop from the handle we passed to jni
+ __ ldr(obj_reg, Address(oop_handle_reg, 0));
+
+ Label done;
+
+ if (UseBiasedLocking) {
+ __ biased_locking_exit(obj_reg, old_hdr, done);
+ }
+
+ // Simple recursive lock?
+
+ __ ldr(rscratch1, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
+ __ cbz(rscratch1, done);
+
+ // Must save r0 if it is live now because cmpxchg must use it
+ if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
+ save_native_result(masm, ret_type, stack_slots);
+ }
+
+
+ // get address of the stack lock
+ __ lea(r0, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
+ // get old displaced header
+ __ ldr(old_hdr, Address(r0, 0));
+
+ // Atomic swap old header if oop still contains the stack lock
+ Label succeed;
+ __ cmpxchgptr(r0, old_hdr, obj_reg, rscratch1, succeed, &slow_path_unlock);
+ __ bind(succeed);
+
+ // slow path re-enters here
+ __ bind(unlock_done);
+ if (ret_type != T_FLOAT && ret_type != T_DOUBLE && ret_type != T_VOID) {
+ restore_native_result(masm, ret_type, stack_slots);
+ }
+
+ __ bind(done);
+
+ }
+ {
+ SkipIfEqual skip(masm, &DTraceMethodProbes, false);
+ save_native_result(masm, ret_type, stack_slots);
+ __ mov_metadata(c_rarg1, method());
+ __ call_VM_leaf(
+ CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit),
+ rthread, c_rarg1);
+ restore_native_result(masm, ret_type, stack_slots);
+ }
+
+ __ reset_last_Java_frame(false, true);
+
+ // Unpack oop result
+ if (ret_type == T_OBJECT || ret_type == T_ARRAY) {
+ Label L;
+ __ cbz(r0, L);
+ __ ldr(r0, Address(r0, 0));
+ __ bind(L);
+ __ verify_oop(r0);
+ }
+
+ if (!is_critical_native) {
+ // reset handle block
+ __ ldr(r2, Address(rthread, JavaThread::active_handles_offset()));
+ __ str(zr, Address(r2, JNIHandleBlock::top_offset_in_bytes()));
+ }
+
+ __ leave();
+
+ if (!is_critical_native) {
+ // Any exception pending?
+ __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
+ __ cbnz(rscratch1, exception_pending);
+ }
+
+ // record exit from native wrapper code
+ if (NotifySimulator) {
+ __ notify(Assembler::method_reentry);
+ }
+
+ // We're done
+ __ ret(lr);
+
+ // Unexpected paths are out of line and go here
+
+ if (!is_critical_native) {
+ // forward the exception
+ __ bind(exception_pending);
+
+ // and forward the exception
+ __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
+ }
+
+ // Slow path locking & unlocking
+ if (method->is_synchronized()) {
+
+ // BEGIN Slow path lock
+ __ bind(slow_path_lock);
+
+ // last_Java_frame is set up. No exceptions, so do a vanilla call, not call_VM
+ // args are (oop obj, BasicLock* lock, JavaThread* thread)
+
+ // protect the args we've loaded
+ save_args(masm, total_c_args, c_arg, out_regs);
+
+ __ mov(c_rarg0, obj_reg);
+ __ mov(c_rarg1, lock_reg);
+ __ mov(c_rarg2, rthread);
+
+ // Not a leaf but we have last_Java_frame setup as we want
+ __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3);
+ restore_args(masm, total_c_args, c_arg, out_regs);
+
+#ifdef ASSERT
+ { Label L;
+ __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
+ __ cbz(rscratch1, L);
+ __ stop("no pending exception allowed on exit from monitorenter");
+ __ bind(L);
+ }
+#endif
+ __ b(lock_done);
+
+ // END Slow path lock
+
+ // BEGIN Slow path unlock
+ __ bind(slow_path_unlock);
+
+ // If we haven't already saved the native result we must save it now as the
+ // floating-point result registers are still exposed.
+
+ if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
+ save_native_result(masm, ret_type, stack_slots);
+ }
+
+ __ lea(c_rarg1, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size));
+ __ mov(c_rarg0, obj_reg);
+
+ // Save pending exception around call to VM (which contains an EXCEPTION_MARK)
+ // NOTE that obj_reg == r19 currently
+ __ ldr(r19, Address(rthread, in_bytes(Thread::pending_exception_offset())));
+ __ str(zr, Address(rthread, in_bytes(Thread::pending_exception_offset())));
+
+ rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C), 2, 0, 1);
+
+#ifdef ASSERT
+ {
+ Label L;
+ __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
+ __ cbz(rscratch1, L);
+ __ stop("no pending exception allowed on exit complete_monitor_unlocking_C");
+ __ bind(L);
+ }
+#endif /* ASSERT */
+
+ __ str(r19, Address(rthread, in_bytes(Thread::pending_exception_offset())));
+
+ if (ret_type == T_FLOAT || ret_type == T_DOUBLE ) {
+ restore_native_result(masm, ret_type, stack_slots);
+ }
+ __ b(unlock_done);
+
+ // END Slow path unlock
+
+ } // synchronized
+
+ // SLOW PATH Reguard the stack if needed
+
+ __ bind(reguard);
+ save_native_result(masm, ret_type, stack_slots);
+ rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages), 0, 0, 0);
+ restore_native_result(masm, ret_type, stack_slots);
+ // and continue
+ __ b(reguard_done);
+
+
+
+ __ flush();
+
+ nmethod *nm = nmethod::new_native_nmethod(method,
+ compile_id,
+ masm->code(),
+ vep_offset,
+ frame_complete,
+ stack_slots / VMRegImpl::slots_per_word,
+ (is_static ? in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)),
+ in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size),
+ oop_maps);
+
+ if (is_critical_native) {
+ nm->set_lazy_critical_native(true);
+ }
+
+ return nm;
+
+}
+
+
+#ifdef HAVE_DTRACE_H
+// ---------------------------------------------------------------------------
+// Generate a dtrace nmethod for a given signature. The method takes arguments
+// in the Java compiled code convention, marshals them to the native
+// abi and then leaves nops at the position you would expect to call a native
+// function. When the probe is enabled the nops are replaced with a trap
+// instruction that dtrace inserts and the trace will cause a notification
+// to dtrace.
+//
+// The probes are only able to take primitive types and java/lang/String as
+// arguments. No other java types are allowed. Strings are converted to utf8
+ // strings so that from dtrace's point of view java strings are converted to C
+ // strings. There is an arbitrary fixed limit on the total space that a method
+ // can use for converting the strings. (256 chars per string in the signature).
+ // So any java string larger than this is truncated.
+
+static int fp_offset[ConcreteRegisterImpl::number_of_registers] = { 0 };
+static bool offsets_initialized = false;
+
+
+nmethod *SharedRuntime::generate_dtrace_nmethod(MacroAssembler *masm,
+ methodHandle method) { Unimplemented(); return 0; }
+
+#endif // HAVE_DTRACE_H
+
+ // this function returns the adjustment (in number of words) to a c2i adapter
+ // activation for use during deoptimization
+int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals) {
+ assert(callee_locals >= callee_parameters,
+ "test and remove; got more parms than locals");
+ if (callee_locals < callee_parameters)
+ return 0; // No adjustment for negative locals
+ int diff = (callee_locals - callee_parameters) * Interpreter::stackElementWords;
+ // diff is counted in stack words
+ return round_to(diff, 2);
+}
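+
+// Worked example (illustrative, assuming Interpreter::stackElementWords == 1):
+// with callee_parameters == 2 and callee_locals == 5, diff == 3 stack words,
+// which round_to rounds up to 4 so the extended frame stays 16-byte aligned.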
+
+
+//------------------------------generate_deopt_blob----------------------------
+void SharedRuntime::generate_deopt_blob() {
+ // Allocate space for the code
+ ResourceMark rm;
+ // Setup code generation tools
+ CodeBuffer buffer("deopt_blob", 2048, 1024);
+ MacroAssembler* masm = new MacroAssembler(&buffer);
+ int frame_size_in_words;
+ OopMap* map = NULL;
+ OopMapSet *oop_maps = new OopMapSet();
+
+#ifdef BUILTIN_SIM
+ AArch64Simulator *simulator;
+ if (NotifySimulator) {
+ simulator = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
+ simulator->notifyCompile(const_cast<char*>("SharedRuntime::deopt_blob"), __ pc());
+ }
+#endif
+
+ // -------------
+ // This code is entered when returning to a de-optimized nmethod. A return
+ // address has been pushed on the stack, and return values are in
+ // registers.
+ // If we are doing a normal deopt then we were called from the patched
+ // nmethod from the point we returned to the nmethod. So the return
+ // address on the stack is wrong by NativeCall::instruction_size.
+ // We will adjust the value so it looks like we have the original return
+ // address on the stack (like when we eagerly deoptimized).
+ // In the case of an exception pending when deoptimizing, we enter
+ // with a return address on the stack that points after the call we patched
+ // into the exception handler. We have the following register state from,
+ // e.g., the forward exception stub (see stubGenerator_aarch64.cpp).
+ // r0: exception oop
+ // r19: exception handler
+ // r3: throwing pc
+ // So in this case we simply jam r3 into the useless return address and
+ // the stack looks just like we want.
+ //
+ // At this point we need to de-opt. We save the argument return
+ // registers. We call the first C routine, fetch_unroll_info(). This
+ // routine captures the return values and returns a structure which
+ // describes the current frame size and the sizes of all replacement frames.
+ // The current frame is compiled code and may contain many inlined
+ // functions, each with their own JVM state. We pop the current frame, then
+ // push all the new frames. Then we call the C routine unpack_frames() to
+ // populate these frames. Finally unpack_frames() returns us the new target
+ // address. Notice that callee-save registers are BLOWN here; they have
+ // already been captured in the vframeArray at the time the return PC was
+ // patched.
+ address start = __ pc();
+ Label cont;
+
+ // Prolog for non exception case!
+
+ // Save everything in sight.
+ map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
+
+ // Normal deoptimization. Save exec mode for unpack_frames.
+ __ movw(rcpool, Deoptimization::Unpack_deopt); // callee-saved
+ __ b(cont);
+
+ int reexecute_offset = __ pc() - start;
+
+ // Reexecute case
+ // the return address is the pc that describes which bci to re-execute at
+
+ // No need to update map as each call to save_live_registers will produce identical oopmap
+ (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
+
+ __ movw(rcpool, Deoptimization::Unpack_reexecute); // callee-saved
+ __ b(cont);
+
+ int exception_offset = __ pc() - start;
+
+ // Prolog for exception case
+
+ // all registers are dead at this entry point, except for r0, and
+ // r3 which contain the exception oop and exception pc
+ // respectively. Set them in TLS and fall thru to the
+ // unpack_with_exception_in_tls entry point.
+
+ __ str(r3, Address(rthread, JavaThread::exception_pc_offset()));
+ __ str(r0, Address(rthread, JavaThread::exception_oop_offset()));
+
+ int exception_in_tls_offset = __ pc() - start;
+
+ // new implementation because exception oop is now passed in JavaThread
+
+ // Prolog for exception case
+ // All registers must be preserved because they might be used by LinearScan
+ // Exception oop and throwing PC are passed in JavaThread
+ // tos: stack at point of call to method that threw the exception (i.e. only
+ // args are on the stack, no return address)
+
+ // The return address pushed by save_live_registers will be patched
+ // later with the throwing pc. The correct value is not available
+ // now because loading it from memory would destroy registers.
+
+ // NB: The SP at this point must be the SP of the method that is
+ // being deoptimized. Deoptimization assumes that the frame created
+ // here by save_live_registers is immediately below the method's SP.
+ // This is a somewhat fragile mechanism.
+
+ // Save everything in sight.
+ map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
+
+ // Now it is safe to overwrite any register
+
+ // Deopt during an exception. Save exec mode for unpack_frames.
+ __ mov(rcpool, Deoptimization::Unpack_exception); // callee-saved
+
+ // load throwing pc from JavaThread and patch it as the return address
+ // of the current frame. Then clear the field in JavaThread
+
+ __ ldr(r3, Address(rthread, JavaThread::exception_pc_offset()));
+ __ str(r3, Address(rfp, wordSize));
+ __ str(zr, Address(rthread, JavaThread::exception_pc_offset()));
+
+#ifdef ASSERT
+ // verify that there is really an exception oop in JavaThread
+ __ ldr(r0, Address(rthread, JavaThread::exception_oop_offset()));
+ __ verify_oop(r0);
+
+ // verify that there is no pending exception
+ Label no_pending_exception;
+ __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
+ __ cbz(rscratch1, no_pending_exception);
+ __ stop("must not have pending exception here");
+ __ bind(no_pending_exception);
+#endif
+
+ __ bind(cont);
+
+ // Call C code. Need thread and this frame, but NOT official VM entry
+ // crud. We cannot block on this call, no GC can happen.
+ //
+ // UnrollBlock* fetch_unroll_info(JavaThread* thread)
+
+ // fetch_unroll_info needs to call last_java_frame().
+
+ Label retaddr;
+ __ set_last_Java_frame(sp, noreg, retaddr, rscratch1);
+#ifdef ASSERT0
+ { Label L;
+ __ ldr(rscratch1, Address(rthread,
+ JavaThread::last_Java_fp_offset()));
+ __ cbz(rscratch1, L);
+ __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared");
+ __ bind(L);
+ }
+#endif // ASSERT
+ __ mov(c_rarg0, rthread);
+ __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info)));
+ __ blrt(rscratch1, 1, 0, 1);
+ __ bind(retaddr);
+
+ // Need to have an oopmap that tells fetch_unroll_info where to
+ // find any register it might need.
+ oop_maps->add_gc_map(__ pc() - start, map);
+
+ __ reset_last_Java_frame(false, true);
+
+ // Load UnrollBlock* into rdi
+ __ mov(r5, r0);
+
+ Label noException;
+ __ cmpw(rcpool, Deoptimization::Unpack_exception); // Was exception pending?
+ __ br(Assembler::NE, noException);
+ __ ldr(r0, Address(rthread, JavaThread::exception_oop_offset()));
+ // QQQ this is useless it was NULL above
+ __ ldr(r3, Address(rthread, JavaThread::exception_pc_offset()));
+ __ str(zr, Address(rthread, JavaThread::exception_oop_offset()));
+ __ str(zr, Address(rthread, JavaThread::exception_pc_offset()));
+
+ __ verify_oop(r0);
+
+ // Overwrite the result registers with the exception results.
+ __ str(r0, Address(sp, RegisterSaver::r0_offset_in_bytes()));
+ // I think this is useless
+ // __ str(r3, Address(sp, RegisterSaver::r3_offset_in_bytes()));
+
+ __ bind(noException);
+
+ // Only register save data is on the stack.
+ // Now restore the result registers. Everything else is either dead
+ // or captured in the vframeArray.
+ RegisterSaver::restore_result_registers(masm);
+
+ // All of the register save area has been popped off the stack. Only the
+ // return address remains.
+
+ // Pop all the frames we must move/replace.
+ //
+ // Frame picture (youngest to oldest)
+ // 1: self-frame (no frame link)
+ // 2: deopting frame (no frame link)
+ // 3: caller of deopting frame (could be compiled/interpreted).
+ //
+ // Note: by leaving the return address of self-frame on the stack
+ // and using the size of frame 2 to adjust the stack
+ // when we are done the return to frame 3 will still be on the stack.
+
+ // Pop deoptimized frame
+ __ ldrw(r2, Address(r5, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes()));
+ __ sub(r2, r2, 2 * wordSize);
+ __ add(sp, sp, r2);
+ __ ldp(rfp, lr, __ post(sp, 2 * wordSize));
+ // LR should now be the return address to the caller (3)
+
+#ifdef ASSERT
+ // Compilers generate code that bangs the stack by as much as the
+ // interpreter would need. So this stack banging should never
+ // trigger a fault. Verify that it does not on non-product builds.
+ if (UseStackBanging) {
+ __ ldrw(r19, Address(r5, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes()));
+ __ bang_stack_size(r19, r2);
+ }
+#endif
+ // Load address of array of frame pcs into r2
+ __ ldr(r2, Address(r5, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
+
+ // Trash the old pc
+ // __ addptr(sp, wordSize); FIXME ????
+
+ // Load address of array of frame sizes into r4
+ __ ldr(r4, Address(r5, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes()));
+
+ // Load counter into r3
+ __ ldrw(r3, Address(r5, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes()));
+
+ // Now adjust the caller's stack to make up for the extra locals
+ // but record the original sp so that we can save it in the skeletal interpreter
+ // frame and the stack walking of interpreter_sender will get the unextended sp
+ // value and not the "real" sp value.
+
+ const Register sender_sp = r6;
+
+ __ mov(sender_sp, sp);
+ __ ldrw(r19, Address(r5,
+ Deoptimization::UnrollBlock::
+ caller_adjustment_offset_in_bytes()));
+ __ sub(sp, sp, r19);
+
+ // Push interpreter frames in a loop
+ __ mov(rscratch1, (address)0xDEADDEAD); // Make a recognizable pattern
+ __ mov(rscratch2, rscratch1);
+ Label loop;
+ __ bind(loop);
+ __ ldr(r19, Address(__ post(r4, wordSize))); // Load frame size
+ __ sub(r19, r19, 2*wordSize); // We'll push pc and fp by hand
+ __ ldr(lr, Address(__ post(r2, wordSize))); // Load pc
+ __ enter(); // Save old & set new fp
+ __ sub(sp, sp, r19); // Prolog
+ // This value is corrected by layout_activation_impl
+ __ str(zr, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
+ __ str(sender_sp, Address(rfp, frame::interpreter_frame_sender_sp_offset * wordSize)); // Make it walkable
+ __ mov(sender_sp, sp); // Pass sender_sp to next frame
+ __ sub(r3, r3, 1); // Decrement counter
+ __ cbnz(r3, loop);
+
+ // Re-push self-frame
+ __ ldr(lr, Address(r2));
+ __ enter();
+
+ // Allocate a full sized register save area. We subtract 2 because
+ // enter() just pushed 2 words
+ __ sub(sp, sp, (frame_size_in_words - 2) * wordSize);
+
+ // Restore frame locals after moving the frame
+ __ strd(v0, Address(sp, RegisterSaver::v0_offset_in_bytes()));
+ __ str(r0, Address(sp, RegisterSaver::r0_offset_in_bytes()));
+
+ // Call C code. Need thread but NOT official VM entry
+ // crud. We cannot block on this call, no GC can happen. Call should
+ // restore return values to their stack-slots with the new SP.
+ //
+ // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode)
+
+ // Use rfp because the frames look interpreted now
+ // Don't need the precise return PC here, just precise enough to point into this code blob.
+ address the_pc = __ pc();
+ __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
+
+ __ mov(c_rarg0, rthread);
+ __ movw(c_rarg1, rcpool); // second arg: exec_mode
+ __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
+ __ blrt(rscratch1, 2, 0, 0);
+
+ // Set an oopmap for the call site
+ // Use the same PC we used for the last java frame
+ oop_maps->add_gc_map(the_pc - start,
+ new OopMap( frame_size_in_words, 0 ));
+
+ // Clear fp AND pc
+ __ reset_last_Java_frame(true, true);
+
+ // Collect return values
+ __ ldrd(v0, Address(sp, RegisterSaver::v0_offset_in_bytes()));
+ __ ldr(r0, Address(sp, RegisterSaver::r0_offset_in_bytes()));
+ // I think this is useless (throwing pc?)
+ // __ ldr(r3, Address(sp, RegisterSaver::r3_offset_in_bytes()));
+
+ // Pop self-frame.
+ __ leave(); // Epilog
+
+ // Jump to interpreter
+ __ ret(lr);
+
+ // Make sure all code is generated
+ masm->flush();
+
+ _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words);
+ _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset);
+
+#ifdef BUILTIN_SIM
+ if (NotifySimulator) {
+ unsigned char *base = _deopt_blob->code_begin();
+ simulator->notifyRelocate(start, base - start);
+ }
+#endif
+}
+
+uint SharedRuntime::out_preserve_stack_slots() {
+ return 0;
+}
+
+#ifdef COMPILER2
+//------------------------------generate_uncommon_trap_blob--------------------
+void SharedRuntime::generate_uncommon_trap_blob() {
+ // Allocate space for the code
+ ResourceMark rm;
+ // Setup code generation tools
+ CodeBuffer buffer("uncommon_trap_blob", 2048, 1024);
+ MacroAssembler* masm = new MacroAssembler(&buffer);
+
+#ifdef BUILTIN_SIM
+ AArch64Simulator *simulator;
+ if (NotifySimulator) {
+ simulator = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
+ simulator->notifyCompile(const_cast<char*>("SharedRuntime:uncommon_trap_blob"), __ pc());
+ }
+#endif
+
+ assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
+
+ address start = __ pc();
+
+ // Push self-frame. We get here with a return address in LR
+ // and sp should be 16 byte aligned
+ // push rfp and retaddr by hand
+ __ stp(rfp, lr, Address(__ pre(sp, -2 * wordSize)));
+ // we don't expect an arg reg save area
+#ifndef PRODUCT
+ assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
+#endif
+ // the compiler left unloaded_class_index in j_rarg0; move it to where the
+ // runtime expects it.
+ if (c_rarg1 != j_rarg0) {
+ __ movw(c_rarg1, j_rarg0);
+ }
+
+ // we need to set the last Java SP to the stack pointer of the stub frame
+ // and the pc to the address where this runtime call will return
+ // (although actually any pc in this code blob will do).
+ Label retaddr;
+ __ set_last_Java_frame(sp, noreg, retaddr, rscratch1);
+
+ // Call C code. Need thread but NOT official VM entry
+ // crud. We cannot block on this call, no GC can happen. Call should
+ // capture callee-saved registers as well as return values.
+ // n.b. the thread is passed explicitly in c_rarg0 below.
+ //
+ // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index);
+ //
+ // n.b. 2 gp args, 0 fp args, integral return type
+
+ __ mov(c_rarg0, rthread);
+ __ lea(rscratch1,
+ RuntimeAddress(CAST_FROM_FN_PTR(address,
+ Deoptimization::uncommon_trap)));
+ __ blrt(rscratch1, 2, 0, MacroAssembler::ret_type_integral);
+ __ bind(retaddr);
+
+ // Set an oopmap for the call site
+ OopMapSet* oop_maps = new OopMapSet();
+ OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0);
+
+ // location of rfp is known implicitly by the frame sender code
+
+ oop_maps->add_gc_map(__ pc() - start, map);
+
+ __ reset_last_Java_frame(false, true);
+
+ // move UnrollBlock* into r4
+ __ mov(r4, r0);
+
+ // Pop all the frames we must move/replace.
+ //
+ // Frame picture (youngest to oldest)
+ // 1: self-frame (no frame link)
+ // 2: deopting frame (no frame link)
+ // 3: caller of deopting frame (could be compiled/interpreted).
+
+ // Pop self-frame. We have no frame, and must rely only on r0 and sp.
+ __ add(sp, sp, (SimpleRuntimeFrame::framesize) << LogBytesPerInt); // Epilog!
+
+ // Pop deoptimized frame (int)
+ __ ldrw(r2, Address(r4,
+ Deoptimization::UnrollBlock::
+ size_of_deoptimized_frame_offset_in_bytes()));
+ __ sub(r2, r2, 2 * wordSize);
+ __ add(sp, sp, r2);
+ __ ldp(rfp, lr, __ post(sp, 2 * wordSize));
+ // LR should now be the return address to the caller (3) frame
+
+#ifdef ASSERT
+ // Compilers generate code that bangs the stack by as much as the
+ // interpreter would need. So this stack banging should never
+ // trigger a fault. Verify that it does not on non-product builds.
+ if (UseStackBanging) {
+ __ ldrw(r1, Address(r4,
+ Deoptimization::UnrollBlock::
+ total_frame_sizes_offset_in_bytes()));
+ __ bang_stack_size(r1, r2);
+ }
+#endif
+
+ // Load address of array of frame pcs into r2 (address*)
+ __ ldr(r2, Address(r4,
+ Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes()));
+
+ // Load address of array of frame sizes into r5 (intptr_t*)
+ __ ldr(r5, Address(r4,
+ Deoptimization::UnrollBlock::
+ frame_sizes_offset_in_bytes()));
+
+ // Counter
+ __ ldrw(r3, Address(r4,
+ Deoptimization::UnrollBlock::
+ number_of_frames_offset_in_bytes())); // (int)
+
+ // Now adjust the caller's stack to make up for the extra locals but
+ // record the original sp so that we can save it in the skeletal
+ // interpreter frame and the stack walking of interpreter_sender
+ // will get the unextended sp value and not the "real" sp value.
+
+ const Register sender_sp = r8;
+
+ __ mov(sender_sp, sp);
+ __ ldrw(r1, Address(r4,
+ Deoptimization::UnrollBlock::
+ caller_adjustment_offset_in_bytes())); // (int)
+ __ sub(sp, sp, r1);
+
+ // Push interpreter frames in a loop
+ Label loop;
+ __ bind(loop);
+ __ ldr(r1, Address(r5, 0)); // Load frame size
+ __ sub(r1, r1, 2 * wordSize); // We'll push pc and rfp by hand
+ __ ldr(lr, Address(r2, 0)); // Save return address
+ __ enter(); // and old rfp & set new rfp
+ __ sub(sp, sp, r1); // Prolog
+ __ str(sender_sp, Address(rfp, frame::interpreter_frame_sender_sp_offset * wordSize)); // Make it walkable
+ // This value is corrected by layout_activation_impl
+ __ str(zr, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
+ __ mov(sender_sp, sp); // Pass sender_sp to next frame
+ __ add(r5, r5, wordSize); // Bump array pointer (sizes)
+ __ add(r2, r2, wordSize); // Bump array pointer (pcs)
+ __ subsw(r3, r3, 1); // Decrement counter
+ __ br(Assembler::GT, loop);
+ __ ldr(lr, Address(r2, 0)); // save final return address
+ // Re-push self-frame
+ __ enter(); // & old rfp & set new rfp
+
+ // Use rfp because the frames look interpreted now
+ // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP.
+ // Don't need the precise return PC here, just precise enough to point into this code blob.
+ address the_pc = __ pc();
+ __ set_last_Java_frame(sp, rfp, the_pc, rscratch1);
+
+ // Call C code. Need thread but NOT official VM entry
+ // crud. We cannot block on this call, no GC can happen. Call should
+ // restore return values to their stack-slots with the new SP.
+ // n.b. the thread is passed explicitly in c_rarg0 below.
+ //
+ // BasicType unpack_frames(JavaThread* thread, int exec_mode);
+ //
+ // n.b. 2 gp args, 0 fp args, integral return type
+
+ // sp should already be aligned
+ __ mov(c_rarg0, rthread);
+ __ movw(c_rarg1, (unsigned)Deoptimization::Unpack_uncommon_trap);
+ __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames)));
+ __ blrt(rscratch1, 2, 0, MacroAssembler::ret_type_integral);
+
+ // Set an oopmap for the call site
+ // Use the same PC we used for the last java frame
+ oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
+
+ // Clear fp AND pc
+ __ reset_last_Java_frame(true, true);
+
+ // Pop self-frame.
+ __ leave(); // Epilog
+
+ // Jump to interpreter
+ __ ret(lr);
+
+ // Make sure all code is generated
+ masm->flush();
+
+ _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps,
+ SimpleRuntimeFrame::framesize >> 1);
+
+#ifdef BUILTIN_SIM
+ if (NotifySimulator) {
+ unsigned char *base = _uncommon_trap_blob->code_begin();
+ simulator->notifyRelocate(start, base - start);
+ }
+#endif
+}
+#endif // COMPILER2
+
+
+//------------------------------generate_handler_blob------
+//
+// Generate a special Compile2Runtime blob that saves all registers,
+// and setup oopmap.
+//
+SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) {
+ ResourceMark rm;
+ OopMapSet *oop_maps = new OopMapSet();
+ OopMap* map;
+
+ // Allocate space for the code. Setup code generation tools.
+ CodeBuffer buffer("handler_blob", 2048, 1024);
+ MacroAssembler* masm = new MacroAssembler(&buffer);
+
+ address start = __ pc();
+ address call_pc = NULL;
+ int frame_size_in_words;
+ bool cause_return = (poll_type == POLL_AT_RETURN);
+ bool save_vectors = (poll_type == POLL_AT_VECTOR_LOOP);
+
+ // Save registers, fpu state, and flags
+ map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
+
+ // The following is basically a call_VM. However, we need the precise
+ // address of the call in order to generate an oopmap. Hence, we do all the
+ // work ourselves.
+
+ Label retaddr;
+ __ set_last_Java_frame(sp, noreg, retaddr, rscratch1);
+
+ // The return address must always be correct so that the frame constructor
+ // never sees an invalid pc.
+
+ if (!cause_return) {
+ // overwrite the return address pushed by save_live_registers
+ __ ldr(c_rarg0, Address(rthread, JavaThread::saved_exception_pc_offset()));
+ __ str(c_rarg0, Address(rfp, wordSize));
+ }
+
+ // Do the call
+ __ mov(c_rarg0, rthread);
+ __ lea(rscratch1, RuntimeAddress(call_ptr));
+ __ blrt(rscratch1, 1, 0, 1);
+ __ bind(retaddr);
+
+ // Set an oopmap for the call site. This oopmap will map all
+ // oop-registers and debug-info registers as callee-saved. This
+ // will allow deoptimization at this safepoint to find all possible
+ // debug-info recordings, as well as let GC find all oops.
+
+ oop_maps->add_gc_map( __ pc() - start, map);
+
+ Label noException;
+
+ __ reset_last_Java_frame(false, true);
+
+ __ maybe_isb();
+ __ membar(Assembler::LoadLoad | Assembler::LoadStore);
+
+ __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
+ __ cbz(rscratch1, noException);
+
+ // Exception pending
+
+ RegisterSaver::restore_live_registers(masm);
+
+ __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
+
+ // No exception case
+ __ bind(noException);
+
+ // Normal exit, restore registers and exit.
+ RegisterSaver::restore_live_registers(masm);
+
+ __ ret(lr);
+
+ // Make sure all code is generated
+ masm->flush();
+
+ // Fill-out other meta info
+ return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words);
+}
+
+//
+ // generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss)
+//
+// Generate a stub that calls into vm to find out the proper destination
+// of a java call. All the argument registers are live at this point
+// but since this is generic code we don't know what they are and the caller
+// must do any gc of the args.
+//
+RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) {
+ assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before");
+
+ // allocate space for the code
+ ResourceMark rm;
+
+ CodeBuffer buffer(name, 1000, 512);
+ MacroAssembler* masm = new MacroAssembler(&buffer);
+
+ int frame_size_in_words;
+
+ OopMapSet *oop_maps = new OopMapSet();
+ OopMap* map = NULL;
+
+ int start = __ offset();
+
+ map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words);
+
+ int frame_complete = __ offset();
+
+ {
+ Label retaddr;
+ __ set_last_Java_frame(sp, noreg, retaddr, rscratch1);
+
+ __ mov(c_rarg0, rthread);
+ __ lea(rscratch1, RuntimeAddress(destination));
+
+ __ blrt(rscratch1, 1, 0, 1);
+ __ bind(retaddr);
+ }
+
+ // Set an oopmap for the call site.
+ // We need this not only for callee-saved registers, but also for volatile
+ // registers that the compiler might be keeping live across a safepoint.
+
+ oop_maps->add_gc_map( __ offset() - start, map);
+
+ __ maybe_isb();
+
+ // r0 contains the address we are going to jump to assuming no exception got installed
+
+ // clear last_Java_sp
+ __ reset_last_Java_frame(false, true);
+ // check for pending exceptions
+ Label pending;
+ __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
+ __ cbnz(rscratch1, pending);
+
+ // get the returned Method*
+ __ get_vm_result_2(rmethod, rthread);
+ __ str(rmethod, Address(sp, RegisterSaver::reg_offset_in_bytes(rmethod)));
+
+ // r0 is where we want to jump, overwrite rscratch1 which is saved and scratch
+ __ str(r0, Address(sp, RegisterSaver::rscratch1_offset_in_bytes()));
+ RegisterSaver::restore_live_registers(masm);
+
+ // We are back to the original state on entry and ready to go.
+
+ __ br(rscratch1);
+
+ // Pending exception after the safepoint
+
+ __ bind(pending);
+
+ RegisterSaver::restore_live_registers(masm);
+
+ // exception pending => remove activation and forward to exception handler
+
+ __ str(zr, Address(rthread, JavaThread::vm_result_offset()));
+
+ __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
+ __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
+
+ // -------------
+ // make sure all code is generated
+ masm->flush();
+
+ // return the blob
+ // frame_size_words or bytes??
+ return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true);
+}
+
+
+#ifdef COMPILER2
+ // This is here instead of runtime_aarch64.cpp because it uses SimpleRuntimeFrame
+//
+//------------------------------generate_exception_blob---------------------------
+// creates exception blob at the end
+// Using exception blob, this code is jumped from a compiled method.
+ // (see emit_exception_handler in the aarch64.ad file)
+//
+// Given an exception pc at a call we call into the runtime for the
+// handler in this method. This handler might merely restore state
+// (i.e. callee save registers) unwind the frame and jump to the
+// exception handler for the nmethod if there is no Java level handler
+// for the nmethod.
+//
+// This code is entered with a jmp.
+//
+// Arguments:
+// r0: exception oop
+// r3: exception pc
+//
+// Results:
+// r0: exception oop
+// r3: exception pc in caller or ???
+// destination: exception handler of caller
+//
+// Note: the exception pc MUST be at a call (precise debug information)
+// Registers r0, r3, r2, r4, r5, r8-r11 are not callee saved.
+//
+
+void OptoRuntime::generate_exception_blob() {
+ assert(!OptoRuntime::is_callee_saved_register(R3_num), "");
+ assert(!OptoRuntime::is_callee_saved_register(R0_num), "");
+ assert(!OptoRuntime::is_callee_saved_register(R2_num), "");
+
+ assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned");
+
+ // Allocate space for the code
+ ResourceMark rm;
+ // Setup code generation tools
+ CodeBuffer buffer("exception_blob", 2048, 1024);
+ MacroAssembler* masm = new MacroAssembler(&buffer);
+
+ // TODO check various assumptions made here
+ //
+ // make sure we do so before running this
+
+ address start = __ pc();
+
+ // push rfp and retaddr by hand
+ // Exception pc is 'return address' for stack walker
+ __ stp(rfp, lr, Address(__ pre(sp, -2 * wordSize)));
+ // there are no callee save registers and we don't expect an
+ // arg reg save area
+#ifndef PRODUCT
+ assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
+#endif
+ // Store exception in Thread object. We cannot pass any arguments to the
+ // handle_exception call, since we do not want to make any assumption
+ // about the size of the frame in which the exception happened.
+ __ str(r0, Address(rthread, JavaThread::exception_oop_offset()));
+ __ str(r3, Address(rthread, JavaThread::exception_pc_offset()));
+
+ // This call does all the hard work. It checks if an exception handler
+ // exists in the method.
+ // If so, it returns the handler address.
+ // If not, it prepares for stack-unwinding, restoring the callee-save
+ // registers of the frame being removed.
+ //
+ // address OptoRuntime::handle_exception_C(JavaThread* thread)
+ //
+ // n.b. 1 gp arg, 0 fp args, integral return type
+
+ // the stack should always be aligned
+ address the_pc = __ pc();
+ __ set_last_Java_frame(sp, noreg, the_pc, rscratch1);
+ __ mov(c_rarg0, rthread);
+ __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)));
+ __ blrt(rscratch1, 1, 0, MacroAssembler::ret_type_integral);
+ __ maybe_isb();
+
+ // Set an oopmap for the call site. This oopmap will only be used if we
+ // are unwinding the stack. Hence, all locations will be dead.
+ // Callee-saved registers will be the same as the frame above (i.e.,
+ // handle_exception_stub), since they were restored when we got the
+ // exception.
+
+ OopMapSet* oop_maps = new OopMapSet();
+
+ oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0));
+
+ __ reset_last_Java_frame(false, true);
+
+ // Restore callee-saved registers
+
+ // rfp is an implicitly saved callee-saved register (i.e. the calling
+ // convention will save/restore it in the prolog/epilog). Other than that
+ // there are no callee-save registers now that adapter frames are gone,
+ // and we don't expect an arg reg save area.
+ __ ldp(rfp, r3, Address(__ post(sp, 2 * wordSize)));
+
+ // r0: exception handler
+
+ // Restore SP from rfp if the exception PC is a MethodHandle call site.
+ __ ldrw(rscratch1, Address(rthread, JavaThread::is_method_handle_return_offset()));
+ // n.b. Intel uses special register rbp_mh_SP_save here but we will
+ // just hard wire rfp
+ __ cmpw(rscratch1, zr);
+ // the obvious way to conditionally copy rfp to sp if NE
+ // Label skip;
+ // __ br(Assembler::EQ, skip);
+ // __ mov(sp, rfp);
+ // __ bind(skip);
+ // same but branchless
+ __ mov(rscratch1, sp);
+ __ csel(rscratch1, rfp, rscratch1, Assembler::NE);
+ __ mov(sp, rscratch1);
+
+ // We have a handler in r0 (could be deopt blob).
+ __ mov(r8, r0);
+
+ // Get the exception oop
+ __ ldr(r0, Address(rthread, JavaThread::exception_oop_offset()));
+ // Get the exception pc in case we are deoptimized
+ __ ldr(r4, Address(rthread, JavaThread::exception_pc_offset()));
+#ifdef ASSERT
+ __ str(zr, Address(rthread, JavaThread::exception_handler_pc_offset()));
+ __ str(zr, Address(rthread, JavaThread::exception_pc_offset()));
+#endif
+ // Clear the exception oop so GC no longer processes it as a root.
+ __ str(zr, Address(rthread, JavaThread::exception_oop_offset()));
+
+ // r0: exception oop
+ // r8: exception handler
+ // r4: exception pc
+ // Jump to handler
+
+ __ br(r8);
+
+ // Make sure all code is generated
+ masm->flush();
+
+ // Set exception blob
+ _exception_blob = ExceptionBlob::create(&buffer, oop_maps, SimpleRuntimeFrame::framesize >> 1);
+}
+#endif // COMPILER2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,2562 @@
+/*
+ * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "asm/macroAssembler.inline.hpp"
+#include "interpreter/interpreter.hpp"
+#include "nativeInst_aarch64.hpp"
+#include "oops/instanceOop.hpp"
+#include "oops/method.hpp"
+#include "oops/objArrayKlass.hpp"
+#include "oops/oop.inline.hpp"
+#include "prims/methodHandles.hpp"
+#include "runtime/frame.inline.hpp"
+#include "runtime/handles.inline.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubCodeGenerator.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "runtime/thread.inline.hpp"
+#include "utilities/top.hpp"
+#ifdef COMPILER2
+#include "opto/runtime.hpp"
+#endif
+
+#ifdef BUILTIN_SIM
+#include "../../../../../../simulator/simulator.hpp"
+#endif
+
+// Declaration and definition of StubGenerator (no .hpp file).
+// For a more detailed description of the stub routine structure
+// see the comment in stubRoutines.hpp
+
+#undef __
+#define __ _masm->
+#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
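+// With compressed oops each array element is a 4-byte narrow oop, otherwise
+// an 8-byte oop, so TIMES_OOP scales a sign-extended 32-bit index register by
+// 4 or 8 when the stubs below form element addresses.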
+
+#ifdef PRODUCT
+#define BLOCK_COMMENT(str) /* nothing */
+#else
+#define BLOCK_COMMENT(str) __ block_comment(str)
+#endif
+
+#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
+
+// Stub Code definitions
+
+class StubGenerator: public StubCodeGenerator {
+ private:
+
+#ifdef PRODUCT
+#define inc_counter_np(counter) ((void)0)
+#else
+ void inc_counter_np_(int& counter) {
+ __ lea(rscratch2, ExternalAddress((address)&counter));
+ __ ldrw(rscratch1, Address(rscratch2));
+ __ addw(rscratch1, rscratch1, 1);
+ __ strw(rscratch1, Address(rscratch2));
+ }
+#define inc_counter_np(counter) \
+ BLOCK_COMMENT("inc_counter " #counter); \
+ inc_counter_np_(counter);
+#endif
+
+ // Call stubs are used to call Java from C
+ //
+ // Arguments:
+ // c_rarg0: call wrapper address address
+ // c_rarg1: result address
+ // c_rarg2: result type BasicType
+ // c_rarg3: method Method*
+ // c_rarg4: (interpreter) entry point address
+ // c_rarg5: parameters intptr_t*
+ // c_rarg6: parameter size (in words) int
+ // c_rarg7: thread Thread*
+ //
+ // There is no return from the stub itself as any Java result
+ // is written to result
+ //
+ // we save r30 (lr) as the return PC at the base of the frame and
+ // link r29 (fp) below it as the frame pointer, installing sp (r31)
+ // into fp.
+ //
+ // we save r0-r7, which accounts for all the c arguments.
+ //
+ // TODO: strictly do we need to save them all? they are treated as
+ // volatile by C so could we omit saving the ones we are going to
+ // place in global registers (thread? method?) or those we only use
+ // during setup of the Java call?
+ //
+ // we don't need to save r8 which C uses as an indirect result location
+ // return register.
+ //
+ // we don't need to save r9-r15 which both C and Java treat as
+ // volatile
+ //
+ // we don't need to save r16-18 because Java does not use them
+ //
+ // we save r19-r28 which Java uses as scratch registers and C
+ // expects to be callee-save
+ //
+ // we don't save any FP registers since only v8-v15 are callee-save
+ // (strictly only the f and d components) and Java uses them as
+ // callee-save. v0-v7 are arg registers and C treats v16-v31 as
+ // volatile (as does Java?)
+ //
+ // so the stub frame looks like this when we enter Java code
+ //
+ // [ return_from_Java ] <--- sp
+ // [ argument word n ]
+ // ...
+ // -27 [ argument word 1 ]
+ // -26 [ saved d15 ] <--- sp_after_call
+ // -25 [ saved d14 ]
+ // -24 [ saved d13 ]
+ // -23 [ saved d12 ]
+ // -22 [ saved d11 ]
+ // -21 [ saved d10 ]
+ // -20 [ saved d9 ]
+ // -19 [ saved d8 ]
+ // -18 [ saved r28 ]
+ // -17 [ saved r27 ]
+ // -16 [ saved r26 ]
+ // -15 [ saved r25 ]
+ // -14 [ saved r24 ]
+ // -13 [ saved r23 ]
+ // -12 [ saved r22 ]
+ // -11 [ saved r21 ]
+ // -10 [ saved r20 ]
+ // -9 [ saved r19 ]
+ // -8 [ call wrapper (r0) ]
+ // -7 [ result (r1) ]
+ // -6 [ result type (r2) ]
+ // -5 [ method (r3) ]
+ // -4 [ entry point (r4) ]
+ // -3 [ parameters (r5) ]
+ // -2 [ parameter size (r6) ]
+ // -1 [ thread (r7) ]
+ // 0 [ saved fp (r29) ] <--- fp == saved sp (r31)
+ // 1 [ saved lr (r30) ]
+
+ // Call stub stack layout word offsets from fp
+ enum call_stub_layout {
+ sp_after_call_off = -26,
+
+ d15_off = -26,
+ d14_off = -25,
+ d13_off = -24,
+ d12_off = -23,
+ d11_off = -22,
+ d10_off = -21,
+ d9_off = -20,
+ d8_off = -19,
+
+ r28_off = -18,
+ r27_off = -17,
+ r26_off = -16,
+ r25_off = -15,
+ r24_off = -14,
+ r23_off = -13,
+ r22_off = -12,
+ r21_off = -11,
+ r20_off = -10,
+ r19_off = -9,
+ call_wrapper_off = -8,
+ result_off = -7,
+ result_type_off = -6,
+ method_off = -5,
+ entry_point_off = -4,
+ parameters_off = -3,
+ parameter_size_off = -2,
+ thread_off = -1,
+ fp_f = 0,
+ retaddr_off = 1,
+ };
+
+ address generate_call_stub(address& return_address) {
+ assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
+ (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
+ "adjust this code");
+
+ StubCodeMark mark(this, "StubRoutines", "call_stub");
+ address start = __ pc();
+
+ const Address sp_after_call(rfp, sp_after_call_off * wordSize);
+
+ const Address call_wrapper (rfp, call_wrapper_off * wordSize);
+ const Address result (rfp, result_off * wordSize);
+ const Address result_type (rfp, result_type_off * wordSize);
+ const Address method (rfp, method_off * wordSize);
+ const Address entry_point (rfp, entry_point_off * wordSize);
+ const Address parameters (rfp, parameters_off * wordSize);
+ const Address parameter_size(rfp, parameter_size_off * wordSize);
+
+ const Address thread (rfp, thread_off * wordSize);
+
+ const Address d15_save (rfp, d15_off * wordSize);
+ const Address d14_save (rfp, d14_off * wordSize);
+ const Address d13_save (rfp, d13_off * wordSize);
+ const Address d12_save (rfp, d12_off * wordSize);
+ const Address d11_save (rfp, d11_off * wordSize);
+ const Address d10_save (rfp, d10_off * wordSize);
+ const Address d9_save (rfp, d9_off * wordSize);
+ const Address d8_save (rfp, d8_off * wordSize);
+
+ const Address r28_save (rfp, r28_off * wordSize);
+ const Address r27_save (rfp, r27_off * wordSize);
+ const Address r26_save (rfp, r26_off * wordSize);
+ const Address r25_save (rfp, r25_off * wordSize);
+ const Address r24_save (rfp, r24_off * wordSize);
+ const Address r23_save (rfp, r23_off * wordSize);
+ const Address r22_save (rfp, r22_off * wordSize);
+ const Address r21_save (rfp, r21_off * wordSize);
+ const Address r20_save (rfp, r20_off * wordSize);
+ const Address r19_save (rfp, r19_off * wordSize);
+
+ // stub code
+
+ // we need a C prolog to bootstrap the x86 caller into the sim
+ __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
+
+ address aarch64_entry = __ pc();
+
+#ifdef BUILTIN_SIM
+ // Save sender's SP for stack traces.
+ __ mov(rscratch1, sp);
+ __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
+#endif
+ // set up frame and move sp to end of save area
+ __ enter();
+ __ sub(sp, rfp, -sp_after_call_off * wordSize);
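+ // n.b. -sp_after_call_off is 26, so this drops sp to rfp - 26 words,
+ // reserving the whole register save area described in the layout above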
+
+ // save register parameters and Java scratch/global registers
+ // n.b. we save thread even though it gets installed in
+ // rthread because we want to sanity check rthread later
+ __ str(c_rarg7, thread);
+ __ strw(c_rarg6, parameter_size);
+ __ str(c_rarg5, parameters);
+ __ str(c_rarg4, entry_point);
+ __ str(c_rarg3, method);
+ __ str(c_rarg2, result_type);
+ __ str(c_rarg1, result);
+ __ str(c_rarg0, call_wrapper);
+ __ str(r19, r19_save);
+ __ str(r20, r20_save);
+ __ str(r21, r21_save);
+ __ str(r22, r22_save);
+ __ str(r23, r23_save);
+ __ str(r24, r24_save);
+ __ str(r25, r25_save);
+ __ str(r26, r26_save);
+ __ str(r27, r27_save);
+ __ str(r28, r28_save);
+
+ __ strd(v8, d8_save);
+ __ strd(v9, d9_save);
+ __ strd(v10, d10_save);
+ __ strd(v11, d11_save);
+ __ strd(v12, d12_save);
+ __ strd(v13, d13_save);
+ __ strd(v14, d14_save);
+ __ strd(v15, d15_save);
+
+ // install Java thread in global register now we have saved
+ // whatever value it held
+ __ mov(rthread, c_rarg7);
+ // And method
+ __ mov(rmethod, c_rarg3);
+
+ // set up the heapbase register
+ __ reinit_heapbase();
+
+#ifdef ASSERT
+ // make sure we have no pending exceptions
+ {
+ Label L;
+ __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
+ __ cmp(rscratch1, (unsigned)NULL_WORD);
+ __ br(Assembler::EQ, L);
+ __ stop("StubRoutines::call_stub: entered with pending exception");
+ __ BIND(L);
+ }
+#endif
+ // pass parameters if any
+ __ mov(esp, sp);
+ __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
+ __ andr(sp, rscratch1, -2 * wordSize);
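+ // n.b. the -2 * wordSize (-16) mask keeps sp 16-byte aligned, as the
+ // AArch64 ABI requires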
+
+ BLOCK_COMMENT("pass parameters if any");
+ Label parameters_done;
+ // parameter count is still in c_rarg6
+ // and parameter pointer identifying param 1 is in c_rarg5
+ __ cbzw(c_rarg6, parameters_done);
+
+ address loop = __ pc();
+ __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
+ __ subsw(c_rarg6, c_rarg6, 1);
+ __ push(rscratch1);
+ __ br(Assembler::GT, loop);
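+ // n.b. the loop above copies one parameter per iteration from the
+ // array at c_rarg5 and pushes it onto the stack prepared for the Java
+ // call, looping while the decremented count in c_rarg6 is still positive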
+
+ __ BIND(parameters_done);
+
+ // call Java entry -- passing Method*, and current sp
+ // rmethod: Method*
+ // r13: sender sp
+ BLOCK_COMMENT("call Java function");
+ __ mov(r13, sp);
+ __ blr(c_rarg4);
+
+ // tell the simulator we have returned to the stub
+
+ // we do this here because the notify will already have been done
+ // if we get to the next instruction via an exception
+ //
+ // n.b. adding this instruction here affects the calculation of
+ // whether or not a routine returns to the call stub (used when
+ // doing stack walks) since the normal test is to check the return
+ // pc against the address saved below. so we may need to allow for
+ // this extra instruction in the check.
+
+ if (NotifySimulator) {
+ __ notify(Assembler::method_reentry);
+ }
+ // save current address for use by exception handling code
+
+ return_address = __ pc();
+
+ // store result depending on type (everything that is not
+ // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
+ // n.b. this assumes Java returns an integral result in r0
+ // and a floating result in j_farg0
+ __ ldr(j_rarg2, result);
+ Label is_long, is_float, is_double, exit;
+ __ ldr(j_rarg1, result_type);
+ __ cmp(j_rarg1, T_OBJECT);
+ __ br(Assembler::EQ, is_long);
+ __ cmp(j_rarg1, T_LONG);
+ __ br(Assembler::EQ, is_long);
+ __ cmp(j_rarg1, T_FLOAT);
+ __ br(Assembler::EQ, is_float);
+ __ cmp(j_rarg1, T_DOUBLE);
+ __ br(Assembler::EQ, is_double);
+
+ // handle T_INT case
+ __ strw(r0, Address(j_rarg2));
+
+ __ BIND(exit);
+
+ // pop parameters
+ __ sub(esp, rfp, -sp_after_call_off * wordSize);
+
+#ifdef ASSERT
+ // verify that threads correspond
+ {
+ Label L, S;
+ __ ldr(rscratch1, thread);
+ __ cmp(rthread, rscratch1);
+ __ br(Assembler::NE, S);
+ __ get_thread(rscratch1);
+ __ cmp(rthread, rscratch1);
+ __ br(Assembler::EQ, L);
+ __ BIND(S);
+ __ stop("StubRoutines::call_stub: threads must correspond");
+ __ BIND(L);
+ }
+#endif
+
+ // restore callee-save registers
+ __ ldrd(v15, d15_save);
+ __ ldrd(v14, d14_save);
+ __ ldrd(v13, d13_save);
+ __ ldrd(v12, d12_save);
+ __ ldrd(v11, d11_save);
+ __ ldrd(v10, d10_save);
+ __ ldrd(v9, d9_save);
+ __ ldrd(v8, d8_save);
+
+ __ ldr(r28, r28_save);
+ __ ldr(r27, r27_save);
+ __ ldr(r26, r26_save);
+ __ ldr(r25, r25_save);
+ __ ldr(r24, r24_save);
+ __ ldr(r23, r23_save);
+ __ ldr(r22, r22_save);
+ __ ldr(r21, r21_save);
+ __ ldr(r20, r20_save);
+ __ ldr(r19, r19_save);
+ __ ldr(c_rarg0, call_wrapper);
+ __ ldr(c_rarg1, result);
+ __ ldrw(c_rarg2, result_type);
+ __ ldr(c_rarg3, method);
+ __ ldr(c_rarg4, entry_point);
+ __ ldr(c_rarg5, parameters);
+ __ ldr(c_rarg6, parameter_size);
+ __ ldr(c_rarg7, thread);
+
+#ifndef PRODUCT
+ // tell the simulator we are about to end Java execution
+ if (NotifySimulator) {
+ __ notify(Assembler::method_exit);
+ }
+#endif
+ // leave frame and return to caller
+ __ leave();
+ __ ret(lr);
+
+ // handle return types different from T_INT
+
+ __ BIND(is_long);
+ __ str(r0, Address(j_rarg2, 0));
+ __ br(Assembler::AL, exit);
+
+ __ BIND(is_float);
+ __ strs(j_farg0, Address(j_rarg2, 0));
+ __ br(Assembler::AL, exit);
+
+ __ BIND(is_double);
+ __ strd(j_farg0, Address(j_rarg2, 0));
+ __ br(Assembler::AL, exit);
+
+ return start;
+ }
+
+ // Return point for a Java call if there's an exception thrown in
+ // Java code. The exception is caught and transformed into a
+ // pending exception stored in JavaThread that can be tested from
+ // within the VM.
+ //
+ // Note: Usually the parameters are removed by the callee. In case
+ // of an exception crossing an activation frame boundary, that is
+ // not the case if the callee is compiled code => need to setup the
+ // rsp.
+ //
+ // r0: exception oop
+
+ // NOTE: this is used as a target from the signal handler so it
+ // needs an x86 prolog which returns into the current simulator
+ // executing the generated catch_exception code. so the prolog
+ // needs to install rax in a sim register and adjust the sim's
+ // restart pc to enter the generated code at the start position
+ // then return from native to simulated execution.
+
+ address generate_catch_exception() {
+ StubCodeMark mark(this, "StubRoutines", "catch_exception");
+ address start = __ pc();
+
+ // same as in generate_call_stub():
+ const Address sp_after_call(rfp, sp_after_call_off * wordSize);
+ const Address thread (rfp, thread_off * wordSize);
+
+#ifdef ASSERT
+ // verify that threads correspond
+ {
+ Label L, S;
+ __ ldr(rscratch1, thread);
+ __ cmp(rthread, rscratch1);
+ __ br(Assembler::NE, S);
+ __ get_thread(rscratch1);
+ __ cmp(rthread, rscratch1);
+ __ br(Assembler::EQ, L);
+ __ bind(S);
+ __ stop("StubRoutines::catch_exception: threads must correspond");
+ __ bind(L);
+ }
+#endif
+
+ // set pending exception
+ __ verify_oop(r0);
+
+ __ str(r0, Address(rthread, Thread::pending_exception_offset()));
+ __ mov(rscratch1, (address)__FILE__);
+ __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
+ __ movw(rscratch1, (int)__LINE__);
+ __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
+
+ // complete return to VM
+ assert(StubRoutines::_call_stub_return_address != NULL,
+ "_call_stub_return_address must have been generated before");
+ __ b(StubRoutines::_call_stub_return_address);
+
+ return start;
+ }
+
+ // Continuation point for runtime calls returning with a pending
+ // exception. The pending exception check happened in the runtime
+ // or native call stub. The pending exception in Thread is
+ // converted into a Java-level exception.
+ //
+ // Contract with Java-level exception handlers:
+ // r0: exception
+ // r3: throwing pc
+ //
+ // NOTE: At entry of this stub, exception-pc must be in LR !!
+
+ // NOTE: this is always used as a jump target within generated code
+ // so it just needs to be generated code with no x86 prolog
+
+ address generate_forward_exception() {
+ StubCodeMark mark(this, "StubRoutines", "forward exception");
+ address start = __ pc();
+
+ // Upon entry, LR points to the return address returning into
+ // Java (interpreted or compiled) code; i.e., the return address
+ // becomes the throwing pc.
+ //
+ // Arguments pushed before the runtime call are still on the stack
+ // but the exception handler will reset the stack pointer ->
+ // ignore them. A potential result in registers can be ignored as
+ // well.
+
+#ifdef ASSERT
+ // make sure this code is only executed if there is a pending exception
+ {
+ Label L;
+ __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
+ __ cbnz(rscratch1, L);
+ __ stop("StubRoutines::forward exception: no pending exception (1)");
+ __ bind(L);
+ }
+#endif
+
+ // compute exception handler into r19
+
+ // call the VM to find the handler address associated with the
+ // caller address. pass thread in r0 and caller pc (ret address)
+ // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
+ // the stack.
+ __ mov(c_rarg1, lr);
+ // lr will be trashed by the VM call so we move it to R19
+ // (callee-saved) because we also need to pass it to the handler
+ // returned by this call.
+ __ mov(r19, lr);
+ BLOCK_COMMENT("call exception_handler_for_return_address");
+ __ call_VM_leaf(CAST_FROM_FN_PTR(address,
+ SharedRuntime::exception_handler_for_return_address),
+ rthread, c_rarg1);
+ // we should not really care that lr is no longer the callee
+ // address. we saved the value the handler needs in r19 so we can
+ // just copy it to r3. however, the C2 handler will push its own
+ // frame and then call into the VM, and the VM code asserts that
+ // the PC for the frame above the handler belongs to a compiled
+ // Java method. So, we restore lr here to satisfy that assert.
+ __ mov(lr, r19);
+ // setup r0 & r3 & clear pending exception
+ __ mov(r3, r19);
+ __ mov(r19, r0);
+ __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
+ __ str(zr, Address(rthread, Thread::pending_exception_offset()));
+
+#ifdef ASSERT
+ // make sure exception is set
+ {
+ Label L;
+ __ cbnz(r0, L);
+ __ stop("StubRoutines::forward exception: no pending exception (2)");
+ __ bind(L);
+ }
+#endif
+
+ // continue at exception handler
+ // r0: exception
+ // r3: throwing pc
+ // r19: exception handler
+ __ verify_oop(r0);
+ __ br(r19);
+
+ return start;
+ }
+
+ // Non-destructive plausibility checks for oops
+ //
+ // Arguments:
+ // r0: oop to verify
+ // rscratch1: error message
+ //
+ // Stack after saving c_rarg3:
+ // [tos + 0]: saved c_rarg3
+ // [tos + 1]: saved c_rarg2
+ // [tos + 2]: saved lr
+ // [tos + 3]: saved rscratch2
+ // [tos + 4]: saved r0
+ // [tos + 5]: saved rscratch1
+ address generate_verify_oop() {
+
+ StubCodeMark mark(this, "StubRoutines", "verify_oop");
+ address start = __ pc();
+
+ Label exit, error;
+
+ // save c_rarg2 and c_rarg3
+ __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
+
+ // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
+ __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
+ __ ldr(c_rarg3, Address(c_rarg2));
+ __ add(c_rarg3, c_rarg3, 1);
+ __ str(c_rarg3, Address(c_rarg2));
+
+ // object is in r0
+ // make sure object is 'reasonable'
+ __ cbz(r0, exit); // if obj is NULL it is OK
+
+ // Check if the oop is in the right area of memory
+ __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
+ __ andr(c_rarg2, r0, c_rarg3);
+ __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
+
+ // Compare c_rarg2 and c_rarg3. We don't use a compare
+ // instruction here because the flags register is live.
+ __ eor(c_rarg2, c_rarg2, c_rarg3);
+ __ cbnz(c_rarg2, error);
+
+ // make sure klass is 'reasonable', i.e. not NULL.
+ __ load_klass(r0, r0); // get klass
+ __ cbz(r0, error); // if klass is NULL it is broken
+
+ // return if everything seems ok
+ __ bind(exit);
+
+ __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
+ __ ret(lr);
+
+ // handle errors
+ __ bind(error);
+ __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
+
+ __ push(RegSet::range(r0, r29), sp);
+ // debug(char* msg, int64_t pc, int64_t regs[])
+ __ mov(c_rarg0, rscratch1); // pass address of error message
+ __ mov(c_rarg1, lr); // pass return address
+ __ mov(c_rarg2, sp); // pass address of regs on stack
+#ifndef PRODUCT
+ assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
+#endif
+ BLOCK_COMMENT("call MacroAssembler::debug");
+ __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
+ __ blrt(rscratch1, 3, 0, 1);
+
+ return start;
+ }
+
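+ // n.b. this version of array_overlap_test simply branches to the
+ // no-overlap label, so the overlap assertion in generate_checkcast_copy
+ // below can never fire; callers are trusted to pass disjoint arrays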
+ void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
+
+ // Generate code for an array write pre barrier
+ //
+ // addr - starting address
+ // count - element count
+ // tmp - scratch register
+ //
+ // Destroy no registers!
+ //
+ void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
+ BarrierSet* bs = Universe::heap()->barrier_set();
+ switch (bs->kind()) {
+ case BarrierSet::G1SATBCT:
+ case BarrierSet::G1SATBCTLogging:
+ // With G1, don't generate the call if we statically know that the target is uninitialized
+ if (!dest_uninitialized) {
+ __ push(RegSet::range(r0, r29), sp); // integer registers except lr & sp
+ if (count == c_rarg0) {
+ if (addr == c_rarg1) {
+ // exactly backwards!!
+ __ stp(c_rarg0, c_rarg1, __ pre(sp, -2 * wordSize));
+ __ ldp(c_rarg1, c_rarg0, __ post(sp, -2 * wordSize));
+ } else {
+ __ mov(c_rarg1, count);
+ __ mov(c_rarg0, addr);
+ }
+ } else {
+ __ mov(c_rarg0, addr);
+ __ mov(c_rarg1, count);
+ }
+ __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
+ __ pop(RegSet::range(r0, r29), sp); // integer registers except lr & sp
+ }
+ break;
+ case BarrierSet::CardTableModRef:
+ case BarrierSet::CardTableExtension:
+ case BarrierSet::ModRef:
+ break;
+ default:
+ ShouldNotReachHere();
+
+ }
+ }
+
+ //
+ // Generate code for an array write post barrier
+ //
+ // Input:
+ // start - register containing starting address of destination array
+ // end - register containing ending address of destination array
+ // scratch - scratch register
+ //
+ // The input registers are overwritten.
+ // The ending address is inclusive.
+ void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
+ assert_different_registers(start, end, scratch);
+ BarrierSet* bs = Universe::heap()->barrier_set();
+ switch (bs->kind()) {
+ case BarrierSet::G1SATBCT:
+ case BarrierSet::G1SATBCTLogging:
+
+ {
+ __ push(RegSet::range(r0, r29), sp); // integer registers except lr & sp
+ // must compute element count unless barrier set interface is changed (other platforms supply count)
+ assert_different_registers(start, end, scratch);
+ __ lea(scratch, Address(end, BytesPerHeapOop));
+ __ sub(scratch, scratch, start); // subtract start to get #bytes
+ __ lsr(scratch, scratch, LogBytesPerHeapOop); // convert to element count
+ __ mov(c_rarg0, start);
+ __ mov(c_rarg1, scratch);
+ __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
+ __ pop(RegSet::range(r0, r29), sp); // integer registers except lr & sp
+ }
+ break;
+ case BarrierSet::CardTableModRef:
+ case BarrierSet::CardTableExtension:
+ {
+ CardTableModRefBS* ct = (CardTableModRefBS*)bs;
+ assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
+
+ Label L_loop;
+
+ __ lsr(start, start, CardTableModRefBS::card_shift);
+ __ lsr(end, end, CardTableModRefBS::card_shift);
+ __ sub(end, end, start); // number of bytes to copy
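+ // n.b. after the two shifts, start and end are card indexes; each card
+ // covers 2^card_shift bytes of heap (512 by default) and the loop below
+ // stores a zero (dirty) byte for every card in the range, working down
+ // from the last card to the first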
+
+ const Register count = end; // 'end' register contains bytes count now
+ __ mov(scratch, (address)ct->byte_map_base);
+ __ add(start, start, scratch);
+ __ BIND(L_loop);
+ __ strb(zr, Address(start, count));
+ __ subs(count, count, 1);
+ __ br(Assembler::HS, L_loop);
+ }
+ break;
+ default:
+ ShouldNotReachHere();
+
+ }
+ }
+
+ typedef enum {
+ copy_forwards = 1,
+ copy_backwards = -1
+ } copy_direction;
+
+ // Bulk copy of blocks of 8 words.
+ //
+ // count is a count of words.
+ //
+ // Precondition: count >= 2
+ //
+ // Postconditions:
+ //
+ // The least significant bit of count contains the remaining count
+ // of words to copy. The rest of count is trash.
+ //
+ // s and d are adjusted to point to the remaining words to copy
+ //
+ void generate_copy_longs(Label &start, Register s, Register d, Register count,
+ copy_direction direction) {
+ int unit = wordSize * direction;
+
+ int offset;
+ const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
+ t4 = r7, t5 = r10, t6 = r11, t7 = r12;
+
+ assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
+ assert_different_registers(s, d, count, rscratch1);
+
+ Label again, large, small;
+ __ align(6);
+ __ bind(start);
+ __ cmp(count, 8);
+ __ br(Assembler::LO, small);
+ if (direction == copy_forwards) {
+ __ sub(s, s, 2 * wordSize);
+ __ sub(d, d, 2 * wordSize);
+ }
+ __ subs(count, count, 16);
+ __ br(Assembler::GE, large);
+
+ // 8 <= count < 16 words. Copy 8.
+ __ ldp(t0, t1, Address(s, 2 * unit));
+ __ ldp(t2, t3, Address(s, 4 * unit));
+ __ ldp(t4, t5, Address(s, 6 * unit));
+ __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+
+ __ stp(t0, t1, Address(d, 2 * unit));
+ __ stp(t2, t3, Address(d, 4 * unit));
+ __ stp(t4, t5, Address(d, 6 * unit));
+ __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
+
+ if (direction == copy_forwards) {
+ __ add(s, s, 2 * wordSize);
+ __ add(d, d, 2 * wordSize);
+ }
+
+ {
+ Label L1, L2;
+ __ bind(small);
+ __ tbz(count, exact_log2(4), L1);
+ __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
+ __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
+ __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
+ __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
+ __ bind(L1);
+
+ __ tbz(count, 1, L2);
+ __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
+ __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
+ __ bind(L2);
+ }
+
+ __ ret(lr);
+
+ __ align(6);
+ __ bind(large);
+
+ // Fill 8 registers
+ __ ldp(t0, t1, Address(s, 2 * unit));
+ __ ldp(t2, t3, Address(s, 4 * unit));
+ __ ldp(t4, t5, Address(s, 6 * unit));
+ __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+
+ __ bind(again);
+
+ if (direction == copy_forwards && PrefetchCopyIntervalInBytes > 0)
+ __ prfm(Address(s, PrefetchCopyIntervalInBytes), PLDL1KEEP);
+
+ __ stp(t0, t1, Address(d, 2 * unit));
+ __ ldp(t0, t1, Address(s, 2 * unit));
+ __ stp(t2, t3, Address(d, 4 * unit));
+ __ ldp(t2, t3, Address(s, 4 * unit));
+ __ stp(t4, t5, Address(d, 6 * unit));
+ __ ldp(t4, t5, Address(s, 6 * unit));
+ __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
+ __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
+
+ __ subs(count, count, 8);
+ __ br(Assembler::HS, again);
+
+ // Drain
+ __ stp(t0, t1, Address(d, 2 * unit));
+ __ stp(t2, t3, Address(d, 4 * unit));
+ __ stp(t4, t5, Address(d, 6 * unit));
+ __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
+
+ if (direction == copy_forwards) {
+ __ add(s, s, 2 * wordSize);
+ __ add(d, d, 2 * wordSize);
+ }
+
+ {
+ Label L1, L2;
+ __ tbz(count, exact_log2(4), L1);
+ __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
+ __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
+ __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
+ __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
+ __ bind(L1);
+
+ __ tbz(count, 1, L2);
+ __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
+ __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
+ __ bind(L2);
+ }
+
+ __ ret(lr);
+ }
+
+ // Small copy: less than 16 bytes.
+ //
+ // NB: Ignores all of the bits of count which represent more than 15
+ // bytes, so a caller doesn't have to mask them.
+
+ void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
+ bool is_backwards = step < 0;
+ size_t granularity = uabs(step);
+ int direction = is_backwards ? -1 : 1;
+ int unit = wordSize * direction;
+
+ Label Lpair, Lword, Lint, Lshort, Lbyte;
+
+ assert(granularity
+ && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
+
+ const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
+
+ // ??? I don't know if this bit-test-and-branch is the right thing
+ // to do. It does a lot of jumping, resulting in several
+ // mispredicted branches. It might make more sense to do this
+ // with something like Duff's device with a single computed branch.
+
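+ // n.b. each tbz below tests one bit of the residual count and copies
+ // the corresponding power-of-two chunk if it is set. for example, a
+ // byte copy (granularity == 1) with 13 = 0b1101 bytes left copies an
+ // 8-byte word, then a 4-byte int, skips the 2-byte short and finally
+ // copies the trailing byte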
+ __ tbz(count, 3 - exact_log2(granularity), Lword);
+ __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
+ __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
+ __ bind(Lword);
+
+ if (granularity <= sizeof (jint)) {
+ __ tbz(count, 2 - exact_log2(granularity), Lint);
+ __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
+ __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
+ __ bind(Lint);
+ }
+
+ if (granularity <= sizeof (jshort)) {
+ __ tbz(count, 1 - exact_log2(granularity), Lshort);
+ __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
+ __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
+ __ bind(Lshort);
+ }
+
+ if (granularity <= sizeof (jbyte)) {
+ __ tbz(count, 0, Lbyte);
+ __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
+ __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
+ __ bind(Lbyte);
+ }
+ }
+
+ Label copy_f, copy_b;
+
+ // All-singing all-dancing memory copy.
+ //
+ // Copy count units of memory from s to d. The size of a unit is
+ // step, which can be positive or negative depending on the direction
+ // of copy. If is_aligned is false, we align the source address.
+ //
+
+ void copy_memory(bool is_aligned, Register s, Register d,
+ Register count, Register tmp, int step) {
+ copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
+ bool is_backwards = step < 0;
+ int granularity = uabs(step);
+ const Register t0 = r3, t1 = r4;
+
+ if (is_backwards) {
+ __ lea(s, Address(s, count, Address::uxtw(exact_log2(-step))));
+ __ lea(d, Address(d, count, Address::uxtw(exact_log2(-step))));
+ }
+
+ Label done, tail;
+
+ __ cmp(count, 16/granularity);
+ __ br(Assembler::LO, tail);
+
+ // Now we've got the small case out of the way we can align the
+ // source address on a 2-word boundary.
+
+ Label aligned;
+
+ if (is_aligned) {
+ // We may have to adjust by 1 word to get s 2-word-aligned.
+ __ tbz(s, exact_log2(wordSize), aligned);
+ __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
+ __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
+ __ sub(count, count, wordSize/granularity);
+ } else {
+ if (is_backwards) {
+ __ andr(rscratch2, s, 2 * wordSize - 1);
+ } else {
+ __ neg(rscratch2, s);
+ __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
+ }
+ // rscratch2 is the byte adjustment needed to align s.
+ __ cbz(rscratch2, aligned);
+ __ lsr(rscratch2, rscratch2, exact_log2(granularity));
+ __ sub(count, count, rscratch2);
+
+#if 0
+ // ?? This code is only correct for a disjoint copy. It may or
+ // may not make sense to use it in that case.
+
+ // Copy the first pair; s and d may not be aligned.
+ __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
+ __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
+
+ // Align s and d, adjust count
+ if (is_backwards) {
+ __ sub(s, s, rscratch2);
+ __ sub(d, d, rscratch2);
+ } else {
+ __ add(s, s, rscratch2);
+ __ add(d, d, rscratch2);
+ }
+#else
+ copy_memory_small(s, d, rscratch2, rscratch1, step);
+#endif
+ }
+
+ __ cmp(count, 16/granularity);
+ __ br(Assembler::LT, tail);
+ __ bind(aligned);
+
+ // s is now 2-word-aligned.
+
+ // We have a count of units and some trailing bytes. Adjust the
+ // count and do a bulk copy of words.
+ __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
+ if (direction == copy_forwards)
+ __ bl(copy_f);
+ else
+ __ bl(copy_b);
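+ // n.b. rscratch2 is the count register the copy_f/copy_b stubs were
+ // generated with (see the generate_copy_longs calls in
+ // generate_arraycopy_stubs); on return s and d have been advanced past
+ // the bulk-copied words, leaving only the tail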
+
+ // And the tail.
+
+ __ bind(tail);
+ copy_memory_small(s, d, count, tmp, step);
+ }
+
+
+ void clobber_registers() {
+#ifdef ASSERT
+ __ mov(rscratch1, (uint64_t)0xdeadbeef);
+ __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
+ for (Register r = r3; r <= r18; r++)
+ if (r != rscratch1) __ mov(r, rscratch1);
+#endif
+ }
+
+ // Scan over array at a for count oops, verifying each one.
+ // Preserves a and count, clobbers rscratch1 and rscratch2.
+ void verify_oop_array (size_t size, Register a, Register count, Register temp) {
+ Label loop, end;
+ __ mov(rscratch1, a);
+ __ mov(rscratch2, zr);
+ __ bind(loop);
+ __ cmp(rscratch2, count);
+ __ br(Assembler::HS, end);
+ if (size == (size_t)wordSize) {
+ __ ldr(temp, Address(a, rscratch2, Address::uxtw(exact_log2(size))));
+ __ verify_oop(temp);
+ } else {
+ __ ldrw(r16, Address(a, rscratch2, Address::uxtw(exact_log2(size))));
+ __ decode_heap_oop(temp); // calls verify_oop
+ }
+ __ add(rscratch2, rscratch2, size);
+ __ b(loop);
+ __ bind(end);
+ }
+
+ // Arguments:
+ // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
+ // ignored
+ // is_oop - true => oop array, so generate store check code
+ // name - stub name string
+ //
+ // Inputs:
+ // c_rarg0 - source array address
+ // c_rarg1 - destination array address
+ // c_rarg2 - element count, treated as ssize_t, can be zero
+ //
+ // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
+ // the hardware handle it. The two dwords within qwords that span
+ // cache line boundaries will still be loaded and stored atomically.
+ //
+ // Side Effects:
+ // disjoint_int_copy_entry is set to the no-overlap entry point
+ // used by generate_conjoint_int_oop_copy().
+ //
+ address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
+ const char *name, bool dest_uninitialized = false) {
+ Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", name);
+ address start = __ pc();
+ if (entry != NULL) {
+ *entry = __ pc();
+ // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
+ BLOCK_COMMENT("Entry:");
+ }
+ __ enter();
+ if (is_oop) {
+ __ push(RegSet::of(d, count), sp);
+ // no registers are destroyed by this call
+ gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
+ }
+ copy_memory(aligned, s, d, count, rscratch1, size);
+ if (is_oop) {
+ __ pop(RegSet::of(d, count), sp);
+ if (VerifyOops)
+ verify_oop_array(size, d, count, r16);
+ __ sub(count, count, 1); // make an inclusive end pointer
+ __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
+ gen_write_ref_array_post_barrier(d, count, rscratch1);
+ }
+ __ leave();
+ __ ret(lr);
+#ifdef BUILTIN_SIM
+ {
+ AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
+ sim->notifyCompile(const_cast<char*>(name), start);
+ }
+#endif
+ return start;
+ }
+
+ // Arguments:
+ // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
+ // ignored
+ // is_oop - true => oop array, so generate store check code
+ // name - stub name string
+ //
+ // Inputs:
+ // c_rarg0 - source array address
+ // c_rarg1 - destination array address
+ // c_rarg2 - element count, treated as ssize_t, can be zero
+ //
+ // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
+ // the hardware handle it. The two dwords within qwords that span
+ // cache line boundaries will still be loaded and stored atomically.
+ //
+ address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
+ address *entry, const char *name,
+ bool dest_uninitialized = false) {
+ Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
+
+ StubCodeMark mark(this, "StubRoutines", name);
+ address start = __ pc();
+
+ __ cmp(d, s);
+ __ br(Assembler::LS, nooverlap_target);
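+ // n.b. when the destination is at the same or a lower address than the
+ // source a forward copy cannot overwrite data it has yet to read, so we
+ // can safely branch to the disjoint (no-overlap) stub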
+
+ __ enter();
+ if (is_oop) {
+ __ push(RegSet::of(d, count), sp);
+ // no registers are destroyed by this call
+ gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
+ }
+ copy_memory(aligned, s, d, count, rscratch1, -size);
+ if (is_oop) {
+ __ pop(RegSet::of(d, count), sp);
+ if (VerifyOops)
+ verify_oop_array(size, d, count, r16);
+ __ sub(count, count, 1); // make an inclusive end pointer
+ __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
+ gen_write_ref_array_post_barrier(d, count, rscratch1);
+ }
+ __ leave();
+ __ ret(lr);
+#ifdef BUILTIN_SIM
+ {
+ AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
+ sim->notifyCompile(const_cast<char*>(name), start);
+ }
+#endif
+ return start;
+ }
+
+ // Arguments:
+ // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
+ // ignored
+ // name - stub name string
+ //
+ // Inputs:
+ // c_rarg0 - source array address
+ // c_rarg1 - destination array address
+ // c_rarg2 - element count, treated as ssize_t, can be zero
+ //
+ // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
+ // we let the hardware handle it. The one to eight bytes within words,
+ // dwords or qwords that span cache line boundaries will still be loaded
+ // and stored atomically.
+ //
+ // Side Effects:
+ // disjoint_byte_copy_entry is set to the no-overlap entry point
+ // used by generate_conjoint_byte_copy().
+ //
+ address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
+ const bool not_oop = false;
+ return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
+ }
+
+ // Arguments:
+ // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
+ // ignored
+ // name - stub name string
+ //
+ // Inputs:
+ // c_rarg0 - source array address
+ // c_rarg1 - destination array address
+ // c_rarg2 - element count, treated as ssize_t, can be zero
+ //
+ // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
+ // we let the hardware handle it. The one to eight bytes within words,
+ // dwords or qwords that span cache line boundaries will still be loaded
+ // and stored atomically.
+ //
+ address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
+ address* entry, const char *name) {
+ const bool not_oop = false;
+ return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
+ }
+
+ // Arguments:
+ // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
+ // ignored
+ // name - stub name string
+ //
+ // Inputs:
+ // c_rarg0 - source array address
+ // c_rarg1 - destination array address
+ // c_rarg2 - element count, treated as ssize_t, can be zero
+ //
+ // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
+ // let the hardware handle it. The two or four words within dwords
+ // or qwords that span cache line boundaries will still be loaded
+ // and stored atomically.
+ //
+ // Side Effects:
+ // disjoint_short_copy_entry is set to the no-overlap entry point
+ // used by generate_conjoint_short_copy().
+ //
+ address generate_disjoint_short_copy(bool aligned,
+ address* entry, const char *name) {
+ const bool not_oop = false;
+ return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
+ }
+
+ // Arguments:
+ // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
+ // ignored
+ // name - stub name string
+ //
+ // Inputs:
+ // c_rarg0 - source array address
+ // c_rarg1 - destination array address
+ // c_rarg2 - element count, treated as ssize_t, can be zero
+ //
+ // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
+ // let the hardware handle it. The two or four words within dwords
+ // or qwords that span cache line boundaries will still be loaded
+ // and stored atomically.
+ //
+ address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
+ address *entry, const char *name) {
+ const bool not_oop = false;
+ return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
+
+ }
+ // Arguments:
+ // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
+ // ignored
+ // name - stub name string
+ //
+ // Inputs:
+ // c_rarg0 - source array address
+ // c_rarg1 - destination array address
+ // c_rarg2 - element count, treated as ssize_t, can be zero
+ //
+ // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
+ // the hardware handle it. The two dwords within qwords that span
+ // cache line boundaries will still be loaded and stored atomically.
+ //
+ // Side Effects:
+ // disjoint_int_copy_entry is set to the no-overlap entry point
+ // used by generate_conjoint_int_oop_copy().
+ //
+ address generate_disjoint_int_copy(bool aligned, address *entry,
+ const char *name, bool dest_uninitialized = false) {
+ const bool not_oop = false;
+ return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
+ }
+
+ // Arguments:
+ // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
+ // ignored
+ // name - stub name string
+ //
+ // Inputs:
+ // c_rarg0 - source array address
+ // c_rarg1 - destination array address
+ // c_rarg2 - element count, treated as ssize_t, can be zero
+ //
+ // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
+ // the hardware handle it. The two dwords within qwords that span
+ // cache line boundaries will still be loaded and stored atomically.
+ //
+ address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
+ address *entry, const char *name,
+ bool dest_uninitialized = false) {
+ const bool not_oop = false;
+ return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
+ }
+
+
+ // Arguments:
+ // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
+ // ignored
+ // name - stub name string
+ //
+ // Inputs:
+ // c_rarg0 - source array address
+ // c_rarg1 - destination array address
+ // c_rarg2 - element count, treated as size_t, can be zero
+ //
+ // Side Effects:
+ // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
+ // no-overlap entry point used by generate_conjoint_long_oop_copy().
+ //
+ address generate_disjoint_long_copy(bool aligned, address *entry,
+ const char *name, bool dest_uninitialized = false) {
+ const bool not_oop = false;
+ return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
+ }
+
+ // Arguments:
+ // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
+ // ignored
+ // name - stub name string
+ //
+ // Inputs:
+ // c_rarg0 - source array address
+ // c_rarg1 - destination array address
+ // c_rarg2 - element count, treated as size_t, can be zero
+ //
+ address generate_conjoint_long_copy(bool aligned,
+ address nooverlap_target, address *entry,
+ const char *name, bool dest_uninitialized = false) {
+ const bool not_oop = false;
+ return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
+ }
+
+ // Arguments:
+ // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
+ // ignored
+ // name - stub name string
+ //
+ // Inputs:
+ // c_rarg0 - source array address
+ // c_rarg1 - destination array address
+ // c_rarg2 - element count, treated as size_t, can be zero
+ //
+ // Side Effects:
+ // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
+ // no-overlap entry point used by generate_conjoint_long_oop_copy().
+ //
+ address generate_disjoint_oop_copy(bool aligned, address *entry,
+ const char *name, bool dest_uninitialized = false) {
+ const bool is_oop = true;
+ const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
+ return generate_disjoint_copy(size, aligned, is_oop, entry, name);
+ }
+
+ // Arguments:
+ // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
+ // ignored
+ // name - stub name string
+ //
+ // Inputs:
+ // c_rarg0 - source array address
+ // c_rarg1 - destination array address
+ // c_rarg2 - element count, treated as size_t, can be zero
+ //
+ address generate_conjoint_oop_copy(bool aligned,
+ address nooverlap_target, address *entry,
+ const char *name, bool dest_uninitialized = false) {
+ const bool is_oop = true;
+ const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
+ return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, name);
+ }
+
+
+ // Helper for generating a dynamic type check.
+ // Smashes rscratch1.
+ void generate_type_check(Register sub_klass,
+ Register super_check_offset,
+ Register super_klass,
+ Label& L_success) {
+ assert_different_registers(sub_klass, super_check_offset, super_klass);
+
+ BLOCK_COMMENT("type_check:");
+
+ Label L_miss;
+
+ __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
+ super_check_offset);
+ __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
+
+ // Fall through on failure!
+ __ BIND(L_miss);
+ }
+
+ //
+ // Generate checkcasting array copy stub
+ //
+ // Input:
+ // c_rarg0 - source array address
+ // c_rarg1 - destination array address
+ // c_rarg2 - element count, treated as ssize_t, can be zero
+ // c_rarg3 - size_t ckoff (super_check_offset)
+ // c_rarg4 - oop ckval (super_klass)
+ //
+ // Output:
+ // r0 == 0 - success
+ // r0 == -1^K - failure, where K is partial transfer count
+ //
+ address generate_checkcast_copy(const char *name, address *entry,
+ bool dest_uninitialized = false) {
+
+ Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
+
+ // Input registers (after setup_arg_regs)
+ const Register from = c_rarg0; // source array address
+ const Register to = c_rarg1; // destination array address
+ const Register count = c_rarg2; // element count
+ const Register ckoff = c_rarg3; // super_check_offset
+ const Register ckval = c_rarg4; // super_klass
+
+ // Registers used as temps (r18, r19, r20 are save-on-entry)
+ const Register count_save = r21; // original element count
+ const Register start_to = r20; // destination array start address
+ const Register copied_oop = r18; // actual oop copied
+ const Register r19_klass = r19; // oop._klass
+
+ //---------------------------------------------------------------
+ // Assembler stub will be used for this call to arraycopy
+ // if the two arrays are subtypes of Object[] but the
+ // destination array type is not equal to or a supertype
+ // of the source type. Each element must be separately
+ // checked.
+
+ assert_different_registers(from, to, count, ckoff, ckval, start_to,
+ copied_oop, r19_klass, count_save);
+
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", name);
+ address start = __ pc();
+
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+#ifdef ASSERT
+ // caller guarantees that the arrays really are different
+ // otherwise, we would have to make conjoint checks
+ { Label L;
+ array_overlap_test(L, TIMES_OOP);
+ __ stop("checkcast_copy within a single array");
+ __ bind(L);
+ }
+#endif //ASSERT
+
+ // Caller of this entry point must set up the argument registers.
+ if (entry != NULL) {
+ *entry = __ pc();
+ BLOCK_COMMENT("Entry:");
+ }
+
+ // Empty array: Nothing to do.
+ __ cbz(count, L_done);
+
+ __ push(RegSet::of(r18, r19, r20, r21), sp);
+
+#ifdef ASSERT
+ BLOCK_COMMENT("assert consistent ckoff/ckval");
+ // The ckoff and ckval must be mutually consistent,
+ // even though caller generates both.
+ { Label L;
+ int sco_offset = in_bytes(Klass::super_check_offset_offset());
+ __ ldrw(start_to, Address(ckval, sco_offset));
+ __ cmpw(ckoff, start_to);
+ __ br(Assembler::EQ, L);
+ __ stop("super_check_offset inconsistent");
+ __ bind(L);
+ }
+#endif //ASSERT
+
+ // save the original count
+ __ mov(count_save, count);
+
+ // Copy from low to high addresses
+ __ mov(start_to, to); // Save destination array start address
+ __ b(L_load_element);
+
+ // ======== begin loop ========
+ // (Loop is rotated; its entry is L_load_element.)
+ // Loop control:
+ // for (; count != 0; count--) {
+ // copied_oop = load_heap_oop(from++);
+ // ... generate_type_check ...;
+ // store_heap_oop(to++, copied_oop);
+ // }
+ __ align(OptoLoopAlignment);
+
+ __ BIND(L_store_element);
+ __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop
+ __ sub(count, count, 1);
+ __ cbz(count, L_do_card_marks);
+
+ // ======== loop entry is here ========
+ __ BIND(L_load_element);
+ __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
+ __ cbz(copied_oop, L_store_element);
+
+ __ load_klass(r19_klass, copied_oop);// query the object klass
+ generate_type_check(r19_klass, ckoff, ckval, L_store_element);
+ // ======== end loop ========
+
+ // It was a real error; we must depend on the caller to finish the job.
+ // Register count = remaining oops, count_orig = total oops.
+ // Emit GC store barriers for the oops we have copied and report
+ // their number to the caller.
+
+ __ subs(count, count_save, count); // K = partially copied oop count
+ __ eon(count, count, zr); // report (-1^K) to caller
+ __ br(Assembler::EQ, L_done_pop);
+
+ __ BIND(L_do_card_marks);
+ __ add(to, to, -heapOopSize); // make an inclusive end pointer
+ gen_write_ref_array_post_barrier(start_to, to, rscratch1);
+
+ __ bind(L_done_pop);
+ __ pop(RegSet::of(r18, r19, r20, r21), sp);
+ inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
+
+ __ bind(L_done);
+ __ mov(r0, count);
+ __ leave();
+ __ ret(lr);
+
+ return start;
+ }
+
+ // Perform range checks on the proposed arraycopy.
+ // Kills temp, but nothing else.
+ // Also, clean the sign bits of src_pos and dst_pos.
+ void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
+ Register src_pos, // source position (c_rarg1)
+ Register dst, // destination array oop (c_rarg2)
+ Register dst_pos, // destination position (c_rarg3)
+ Register length,
+ Register temp,
+ Label& L_failed) { Unimplemented(); }
+
+ // These stubs get called from some dumb test routine.
+ // I'll write them properly when they're called from
+ // something that's actually doing something.
+ static void fake_arraycopy_stub(address src, address dst, int count) {
+ assert(count == 0, "huh?");
+ }
+
+
+ void generate_arraycopy_stubs() {
+ address entry;
+ address entry_jbyte_arraycopy;
+ address entry_jshort_arraycopy;
+ address entry_jint_arraycopy;
+ address entry_oop_arraycopy;
+ address entry_jlong_arraycopy;
+ address entry_checkcast_arraycopy;
+
+ generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
+ generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
+
+ //*** jbyte
+ // Always need aligned and unaligned versions
+ StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
+ "jbyte_disjoint_arraycopy");
+ StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry,
+ &entry_jbyte_arraycopy,
+ "jbyte_arraycopy");
+ StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
+ "arrayof_jbyte_disjoint_arraycopy");
+ StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL,
+ "arrayof_jbyte_arraycopy");
+
+ //*** jshort
+ // Always need aligned and unaligned versions
+ StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
+ "jshort_disjoint_arraycopy");
+ StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry,
+ &entry_jshort_arraycopy,
+ "jshort_arraycopy");
+ StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
+ "arrayof_jshort_disjoint_arraycopy");
+ StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL,
+ "arrayof_jshort_arraycopy");
+
+ //*** jint
+ // Aligned versions
+ StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
+ "arrayof_jint_disjoint_arraycopy");
+ StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
+ "arrayof_jint_arraycopy");
+ // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
+ // entry_jint_arraycopy always points to the unaligned version
+ StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry,
+ "jint_disjoint_arraycopy");
+ StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry,
+ &entry_jint_arraycopy,
+ "jint_arraycopy");
+
+ //*** jlong
+ // It is always aligned
+ StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
+ "arrayof_jlong_disjoint_arraycopy");
+ StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
+ "arrayof_jlong_arraycopy");
+ StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
+ StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
+
+ //*** oops
+ {
+ // With compressed oops we need unaligned versions; notice that
+ // we overwrite entry_oop_arraycopy.
+ bool aligned = !UseCompressedOops;
+
+ StubRoutines::_arrayof_oop_disjoint_arraycopy
+ = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy");
+ StubRoutines::_arrayof_oop_arraycopy
+ = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy");
+ // Aligned versions without pre-barriers
+ StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
+ = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
+ /*dest_uninitialized*/true);
+ StubRoutines::_arrayof_oop_arraycopy_uninit
+ = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
+ /*dest_uninitialized*/true);
+ }
+
+ StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
+ StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
+ StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
+ StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
+
+ StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
+ StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
+ /*dest_uninitialized*/true);
+ }
+
+ void generate_math_stubs() { Unimplemented(); }
+
+ // Arguments:
+ //
+ // Inputs:
+ // c_rarg0 - source byte array address
+ // c_rarg1 - destination byte array address
+ // c_rarg2 - K (key) in little endian int array
+ //
+ address generate_aescrypt_encryptBlock() {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
+
+ Label L_doLast;
+
+ const Register from = c_rarg0; // source array address
+ const Register to = c_rarg1; // destination array address
+ const Register key = c_rarg2; // key array address
+ const Register keylen = rscratch1;
+
+ address start = __ pc();
+ __ enter();
+
+ __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
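+ // n.b. the expanded key length in ints identifies the key size:
+ // 44, 52 or 60 ints (4 * (rounds + 1)) for 128-, 192- and 256-bit AES
+ // respectively, hence the comparisons against 44 and 52 below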
+
+ __ ld1(v0, __ T16B, from); // get 16 bytes of input
+
+ __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
+ __ rev32(v1, __ T16B, v1);
+ __ rev32(v2, __ T16B, v2);
+ __ rev32(v3, __ T16B, v3);
+ __ rev32(v4, __ T16B, v4);
+ __ aese(v0, v1);
+ __ aesmc(v0, v0);
+ __ aese(v0, v2);
+ __ aesmc(v0, v0);
+ __ aese(v0, v3);
+ __ aesmc(v0, v0);
+ __ aese(v0, v4);
+ __ aesmc(v0, v0);
+
+ __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
+ __ rev32(v1, __ T16B, v1);
+ __ rev32(v2, __ T16B, v2);
+ __ rev32(v3, __ T16B, v3);
+ __ rev32(v4, __ T16B, v4);
+ __ aese(v0, v1);
+ __ aesmc(v0, v0);
+ __ aese(v0, v2);
+ __ aesmc(v0, v0);
+ __ aese(v0, v3);
+ __ aesmc(v0, v0);
+ __ aese(v0, v4);
+ __ aesmc(v0, v0);
+
+ __ ld1(v1, v2, __ T16B, __ post(key, 32));
+ __ rev32(v1, __ T16B, v1);
+ __ rev32(v2, __ T16B, v2);
+
+ __ cmpw(keylen, 44);
+ __ br(Assembler::EQ, L_doLast);
+
+ __ aese(v0, v1);
+ __ aesmc(v0, v0);
+ __ aese(v0, v2);
+ __ aesmc(v0, v0);
+
+ __ ld1(v1, v2, __ T16B, __ post(key, 32));
+ __ rev32(v1, __ T16B, v1);
+ __ rev32(v2, __ T16B, v2);
+
+ __ cmpw(keylen, 52);
+ __ br(Assembler::EQ, L_doLast);
+
+ __ aese(v0, v1);
+ __ aesmc(v0, v0);
+ __ aese(v0, v2);
+ __ aesmc(v0, v0);
+
+ __ ld1(v1, v2, __ T16B, __ post(key, 32));
+ __ rev32(v1, __ T16B, v1);
+ __ rev32(v2, __ T16B, v2);
+
+ __ BIND(L_doLast);
+
+ __ aese(v0, v1);
+ __ aesmc(v0, v0);
+ __ aese(v0, v2);
+
+ __ ld1(v1, __ T16B, key);
+ __ rev32(v1, __ T16B, v1);
+ __ eor(v0, __ T16B, v0, v1);
+
+ __ st1(v0, __ T16B, to);
+
+ __ mov(r0, 0);
+
+ __ leave();
+ __ ret(lr);
+
+ return start;
+ }
+
+ // Arguments:
+ //
+ // Inputs:
+ // c_rarg0 - source byte array address
+ // c_rarg1 - destination byte array address
+ // c_rarg2 - K (key) in little endian int array
+ //
+ address generate_aescrypt_decryptBlock() {
+ assert(UseAES, "need AES instructions and misaligned SSE support");
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
+ Label L_doLast;
+
+ const Register from = c_rarg0; // source array address
+ const Register to = c_rarg1; // destination array address
+ const Register key = c_rarg2; // key array address
+ const Register keylen = rscratch1;
+
+ address start = __ pc();
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+ __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+ __ ld1(v0, __ T16B, from); // get 16 bytes of input
+
+ __ ld1(v5, __ T16B, __ post(key, 16));
+ __ rev32(v5, __ T16B, v5);
+
+ __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
+ __ rev32(v1, __ T16B, v1);
+ __ rev32(v2, __ T16B, v2);
+ __ rev32(v3, __ T16B, v3);
+ __ rev32(v4, __ T16B, v4);
+ __ aesd(v0, v1);
+ __ aesimc(v0, v0);
+ __ aesd(v0, v2);
+ __ aesimc(v0, v0);
+ __ aesd(v0, v3);
+ __ aesimc(v0, v0);
+ __ aesd(v0, v4);
+ __ aesimc(v0, v0);
+
+ __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
+ __ rev32(v1, __ T16B, v1);
+ __ rev32(v2, __ T16B, v2);
+ __ rev32(v3, __ T16B, v3);
+ __ rev32(v4, __ T16B, v4);
+ __ aesd(v0, v1);
+ __ aesimc(v0, v0);
+ __ aesd(v0, v2);
+ __ aesimc(v0, v0);
+ __ aesd(v0, v3);
+ __ aesimc(v0, v0);
+ __ aesd(v0, v4);
+ __ aesimc(v0, v0);
+
+ __ ld1(v1, v2, __ T16B, __ post(key, 32));
+ __ rev32(v1, __ T16B, v1);
+ __ rev32(v2, __ T16B, v2);
+
+ __ cmpw(keylen, 44);
+ __ br(Assembler::EQ, L_doLast);
+
+ __ aesd(v0, v1);
+ __ aesimc(v0, v0);
+ __ aesd(v0, v2);
+ __ aesimc(v0, v0);
+
+ __ ld1(v1, v2, __ T16B, __ post(key, 32));
+ __ rev32(v1, __ T16B, v1);
+ __ rev32(v2, __ T16B, v2);
+
+ __ cmpw(keylen, 52);
+ __ br(Assembler::EQ, L_doLast);
+
+ __ aesd(v0, v1);
+ __ aesimc(v0, v0);
+ __ aesd(v0, v2);
+ __ aesimc(v0, v0);
+
+ __ ld1(v1, v2, __ T16B, __ post(key, 32));
+ __ rev32(v1, __ T16B, v1);
+ __ rev32(v2, __ T16B, v2);
+
+ __ BIND(L_doLast);
+
+ __ aesd(v0, v1);
+ __ aesimc(v0, v0);
+ __ aesd(v0, v2);
+
+ __ eor(v0, __ T16B, v0, v5);
+
+ __ st1(v0, __ T16B, to);
+
+ __ mov(r0, 0);
+
+ __ leave();
+ __ ret(lr);
+
+ return start;
+ }
+
+ // Arguments:
+ //
+ // Inputs:
+ // c_rarg0 - source byte array address
+ // c_rarg1 - destination byte array address
+ // c_rarg2 - K (key) in little endian int array
+ // c_rarg3 - r vector byte array address
+ // c_rarg4 - input length
+ //
+ // Output:
+ // x0 - input length
+ //
+ address generate_cipherBlockChaining_encryptAESCrypt() {
+ assert(UseAES, "need AES instructions and misaligned SSE support");
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
+
+ Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
+
+ const Register from = c_rarg0; // source array address
+ const Register to = c_rarg1; // destination array address
+ const Register key = c_rarg2; // key array address
+ const Register rvec = c_rarg3; // r byte array initialized from initvector array address
+ // and left with the results of the last encryption block
+ const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
+ const Register keylen = rscratch1;
+
+ address start = __ pc();
+ __ enter();
+
+ __ mov(rscratch2, len_reg); // save the input length for the return value
+ __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+ __ ld1(v0, __ T16B, rvec);
+
+ __ cmpw(keylen, 52);
+ __ br(Assembler::CC, L_loadkeys_44);
+ __ br(Assembler::EQ, L_loadkeys_52);
+
+ __ ld1(v17, v18, __ T16B, __ post(key, 32));
+ __ rev32(v17, __ T16B, v17);
+ __ rev32(v18, __ T16B, v18);
+ __ BIND(L_loadkeys_52);
+ __ ld1(v19, v20, __ T16B, __ post(key, 32));
+ __ rev32(v19, __ T16B, v19);
+ __ rev32(v20, __ T16B, v20);
+ __ BIND(L_loadkeys_44);
+ __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
+ __ rev32(v21, __ T16B, v21);
+ __ rev32(v22, __ T16B, v22);
+ __ rev32(v23, __ T16B, v23);
+ __ rev32(v24, __ T16B, v24);
+ __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
+ __ rev32(v25, __ T16B, v25);
+ __ rev32(v26, __ T16B, v26);
+ __ rev32(v27, __ T16B, v27);
+ __ rev32(v28, __ T16B, v28);
+ __ ld1(v29, v30, v31, __ T16B, key);
+ __ rev32(v29, __ T16B, v29);
+ __ rev32(v30, __ T16B, v30);
+ __ rev32(v31, __ T16B, v31);
+
+ __ BIND(L_aes_loop);
+ __ ld1(v1, __ T16B, __ post(from, 16));
+ __ eor(v0, __ T16B, v0, v1);
+
+ __ br(Assembler::CC, L_rounds_44);
+ __ br(Assembler::EQ, L_rounds_52);
+
+ __ aese(v0, v17); __ aesmc(v0, v0);
+ __ aese(v0, v18); __ aesmc(v0, v0);
+ __ BIND(L_rounds_52);
+ __ aese(v0, v19); __ aesmc(v0, v0);
+ __ aese(v0, v20); __ aesmc(v0, v0);
+ __ BIND(L_rounds_44);
+ __ aese(v0, v21); __ aesmc(v0, v0);
+ __ aese(v0, v22); __ aesmc(v0, v0);
+ __ aese(v0, v23); __ aesmc(v0, v0);
+ __ aese(v0, v24); __ aesmc(v0, v0);
+ __ aese(v0, v25); __ aesmc(v0, v0);
+ __ aese(v0, v26); __ aesmc(v0, v0);
+ __ aese(v0, v27); __ aesmc(v0, v0);
+ __ aese(v0, v28); __ aesmc(v0, v0);
+ __ aese(v0, v29); __ aesmc(v0, v0);
+ __ aese(v0, v30);
+ __ eor(v0, __ T16B, v0, v31);
+
+ __ st1(v0, __ T16B, __ post(to, 16));
+ __ sub(len_reg, len_reg, 16);
+ __ cbnz(len_reg, L_aes_loop);
+
+ __ st1(v0, __ T16B, rvec);
+
+ __ mov(r0, rscratch2);
+
+ __ leave();
+ __ ret(lr);
+
+ return start;
+ }
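+
+ // For reference (sketch of the intended semantics, not generated code,
+ // helper names hypothetical): the loop above is standard CBC encryption
+ // with the chaining value kept in v0 across blocks:
+ //
+ //   uint8_t v[16]; memcpy(v, rvec, 16);
+ //   for (size_t off = 0; off < len; off += 16) {
+ //     xor16(v, from + off);              // eor v0, v0, v1
+ //     aes_encrypt_block(v, key);         // aese/aesmc round chain
+ //     memcpy(to + off, v, 16);           // st1 {v0.16b}, [to], 16
+ //   }
+ //   memcpy(rvec, v, 16);                 // chaining value for the next call
+ //   return len;                          // original length returned in r0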
+
+ // Arguments:
+ //
+ // Inputs:
+ // c_rarg0 - source byte array address
+ // c_rarg1 - destination byte array address
+ // c_rarg2 - K (key) in little endian int array
+ // c_rarg3 - r vector byte array address
+ // c_rarg4 - input length
+ //
+ // Output:
+ // x0 - input length
+ //
+ address generate_cipherBlockChaining_decryptAESCrypt() {
+ assert(UseAES, "need AES instructions");
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+
+ Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
+
+ const Register from = c_rarg0; // source array address
+ const Register to = c_rarg1; // destination array address
+ const Register key = c_rarg2; // key array address
+ const Register rvec = c_rarg3; // r byte array initialized from initvector array address
+ // and left with the results of the last encryption block
+ const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16)
+ const Register keylen = rscratch1;
+
+ address start = __ pc();
+ __ enter();
+
+ __ mov(rscratch2, len_reg);
+ __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
+
+ __ ld1(v2, __ T16B, rvec);
+
+ __ ld1(v31, __ T16B, __ post(key, 16));
+ __ rev32(v31, __ T16B, v31);
+
+ __ cmpw(keylen, 52);
+ __ br(Assembler::CC, L_loadkeys_44);
+ __ br(Assembler::EQ, L_loadkeys_52);
+
+ __ ld1(v17, v18, __ T16B, __ post(key, 32));
+ __ rev32(v17, __ T16B, v17);
+ __ rev32(v18, __ T16B, v18);
+ __ BIND(L_loadkeys_52);
+ __ ld1(v19, v20, __ T16B, __ post(key, 32));
+ __ rev32(v19, __ T16B, v19);
+ __ rev32(v20, __ T16B, v20);
+ __ BIND(L_loadkeys_44);
+ __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
+ __ rev32(v21, __ T16B, v21);
+ __ rev32(v22, __ T16B, v22);
+ __ rev32(v23, __ T16B, v23);
+ __ rev32(v24, __ T16B, v24);
+ __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
+ __ rev32(v25, __ T16B, v25);
+ __ rev32(v26, __ T16B, v26);
+ __ rev32(v27, __ T16B, v27);
+ __ rev32(v28, __ T16B, v28);
+ __ ld1(v29, v30, __ T16B, key);
+ __ rev32(v29, __ T16B, v29);
+ __ rev32(v30, __ T16B, v30);
+
+ __ BIND(L_aes_loop);
+ __ ld1(v0, __ T16B, __ post(from, 16));
+ __ orr(v1, __ T16B, v0, v0);
+
+ __ br(Assembler::CC, L_rounds_44);
+ __ br(Assembler::EQ, L_rounds_52);
+
+ __ aesd(v0, v17); __ aesimc(v0, v0);
+ __ aesd(v0, v18); __ aesimc(v0, v0);
+ __ BIND(L_rounds_52);
+ __ aesd(v0, v19); __ aesimc(v0, v0);
+ __ aesd(v0, v20); __ aesimc(v0, v0);
+ __ BIND(L_rounds_44);
+ __ aesd(v0, v21); __ aesimc(v0, v0);
+ __ aesd(v0, v22); __ aesimc(v0, v0);
+ __ aesd(v0, v23); __ aesimc(v0, v0);
+ __ aesd(v0, v24); __ aesimc(v0, v0);
+ __ aesd(v0, v25); __ aesimc(v0, v0);
+ __ aesd(v0, v26); __ aesimc(v0, v0);
+ __ aesd(v0, v27); __ aesimc(v0, v0);
+ __ aesd(v0, v28); __ aesimc(v0, v0);
+ __ aesd(v0, v29); __ aesimc(v0, v0);
+ __ aesd(v0, v30);
+ __ eor(v0, __ T16B, v0, v31);
+ __ eor(v0, __ T16B, v0, v2);
+
+ __ st1(v0, __ T16B, __ post(to, 16));
+ __ orr(v2, __ T16B, v1, v1);
+
+ __ sub(len_reg, len_reg, 16);
+ __ cbnz(len_reg, L_aes_loop);
+
+ __ st1(v2, __ T16B, rvec);
+
+ __ mov(r0, rscratch2);
+
+ __ leave();
+ __ ret(lr);
+
+ return start;
+ }
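+
+ // For reference (sketch of the intended semantics, not generated code,
+ // helper names hypothetical): CBC decryption needs the previous
+ // ciphertext block, so the loop keeps the incoming block in v1 and the
+ // chaining value in v2:
+ //
+ //   uint8_t prev[16]; memcpy(prev, rvec, 16);
+ //   for (size_t off = 0; off < len; off += 16) {
+ //     uint8_t c[16]; memcpy(c, from + off, 16);   // ld1, then orr v1, v0, v0
+ //     aes_decrypt_block(c, key);                  // aesd/aesimc chain + eor v31
+ //     xor16(c, prev);                             // eor with v2
+ //     memcpy(to + off, c, 16);
+ //     memcpy(prev, from + off, 16);               // orr v2, v1, v1
+ //   }
+ //   memcpy(rvec, prev, 16);
+ //   return len;                                   // returned in r0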
+
+ // Arguments:
+ //
+ // Inputs:
+ // c_rarg0 - byte[] source+offset
+ // c_rarg1 - int[] SHA.state
+ // c_rarg2 - int offset
+ // c_rarg3 - int limit
+ //
+ address generate_sha1_implCompress(bool multi_block, const char *name) {
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", name);
+ address start = __ pc();
+
+ Register buf = c_rarg0;
+ Register state = c_rarg1;
+ Register ofs = c_rarg2;
+ Register limit = c_rarg3;
+
+ Label keys;
+ Label sha1_loop;
+
+ // load the keys into v0..v3
+ __ adr(rscratch1, keys);
+ __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
+ // load 5 words state into v6, v7
+ __ ldrq(v6, Address(state, 0));
+ __ ldrs(v7, Address(state, 16));
+
+
+ __ BIND(sha1_loop);
+ // load 64 bytes of data into v16..v19
+ __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
+ __ rev32(v16, __ T16B, v16);
+ __ rev32(v17, __ T16B, v17);
+ __ rev32(v18, __ T16B, v18);
+ __ rev32(v19, __ T16B, v19);
+
+ // do the sha1
+ __ addv(v4, __ T4S, v16, v0);
+ __ orr(v20, __ T16B, v6, v6);
+
+ FloatRegister d0 = v16;
+ FloatRegister d1 = v17;
+ FloatRegister d2 = v18;
+ FloatRegister d3 = v19;
+
+ for (int round = 0; round < 20; round++) {
+ FloatRegister tmp1 = (round & 1) ? v4 : v5;
+ FloatRegister tmp2 = (round & 1) ? v21 : v22;
+ FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
+ FloatRegister tmp4 = (round & 1) ? v5 : v4;
+ FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
+
+ if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
+ if (round < 19) __ addv(tmp1, __ T4S, d1, key);
+ __ sha1h(tmp2, __ T4S, v20);
+ if (round < 5)
+ __ sha1c(v20, __ T4S, tmp3, tmp4);
+ else if (round < 10 || round >= 15)
+ __ sha1p(v20, __ T4S, tmp3, tmp4);
+ else
+ __ sha1m(v20, __ T4S, tmp3, tmp4);
+ if (round < 16) __ sha1su1(d0, __ T4S, d3);
+
+ tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
+ }
+
+ __ addv(v7, __ T2S, v7, v21);
+ __ addv(v6, __ T4S, v6, v20);
+
+ if (multi_block) {
+ __ add(ofs, ofs, 64);
+ __ cmp(ofs, limit);
+ __ br(Assembler::LE, sha1_loop);
+ __ mov(c_rarg0, ofs); // return ofs
+ }
+
+ __ strq(v6, Address(state, 0));
+ __ strs(v7, Address(state, 16));
+
+ __ ret(lr);
+
+ __ bind(keys);
+ __ emit_int32(0x5a827999);
+ __ emit_int32(0x6ed9eba1);
+ __ emit_int32(0x8f1bbcdc);
+ __ emit_int32(0xca62c1d6);
+
+ return start;
+ }
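+
+ // For reference (usage sketch, not generated code): the four words emitted
+ // at 'keys' are the standard SHA-1 round constants K0..K3. With
+ // multi_block the stub behaves roughly like
+ //
+ //   do { sha1_compress(state, buf); buf += 64; ofs += 64; } while (ofs <= limit);
+ //   return ofs;                            // returned in c_rarg0
+ //
+ // while the single-block variant compresses exactly one 64-byte block.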
+
+
+ // Arguments:
+ //
+ // Inputs:
+ // c_rarg0 - byte[] source+offset
+ // c_rarg1 - int[] SHA.state
+ // c_rarg2 - int offset
+ // c_rarg3 - int limit
+ //
+ address generate_sha256_implCompress(bool multi_block, const char *name) {
+ static const uint32_t round_consts[64] = {
+ 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+ 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+ 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+ 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+ 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+ 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+ 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+ 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+ 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+ 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+ 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+ 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+ 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+ 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+ 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+ 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
+ };
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", name);
+ address start = __ pc();
+
+ Register buf = c_rarg0;
+ Register state = c_rarg1;
+ Register ofs = c_rarg2;
+ Register limit = c_rarg3;
+
+ Label sha1_loop;
+
+ __ stpd(v8, v9, __ pre(sp, -32));
+ __ stpd(v10, v11, Address(sp, 16));
+
+// dga == v0
+// dgb == v1
+// dg0 == v2
+// dg1 == v3
+// dg2 == v4
+// t0 == v6
+// t1 == v7
+
+ // load 16 keys to v16..v31
+ __ lea(rscratch1, ExternalAddress((address)round_consts));
+ __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
+ __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
+ __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
+ __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
+
+ // load 8 words (256 bits) state
+ __ ldpq(v0, v1, state);
+
+ __ BIND(sha1_loop);
+ // load 64 bytes of data into v8..v11
+ __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
+ __ rev32(v8, __ T16B, v8);
+ __ rev32(v9, __ T16B, v9);
+ __ rev32(v10, __ T16B, v10);
+ __ rev32(v11, __ T16B, v11);
+
+ __ addv(v6, __ T4S, v8, v16);
+ __ orr(v2, __ T16B, v0, v0);
+ __ orr(v3, __ T16B, v1, v1);
+
+ FloatRegister d0 = v8;
+ FloatRegister d1 = v9;
+ FloatRegister d2 = v10;
+ FloatRegister d3 = v11;
+
+
+ for (int round = 0; round < 16; round++) {
+ FloatRegister tmp1 = (round & 1) ? v6 : v7;
+ FloatRegister tmp2 = (round & 1) ? v7 : v6;
+ FloatRegister tmp3 = (round & 1) ? v2 : v4;
+ FloatRegister tmp4 = (round & 1) ? v4 : v2;
+
+ if (round < 12) __ sha256su0(d0, __ T4S, d1);
+ __ orr(v4, __ T16B, v2, v2);
+ if (round < 15)
+ __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
+ __ sha256h(v2, __ T4S, v3, tmp2);
+ __ sha256h2(v3, __ T4S, v4, tmp2);
+ if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
+
+ tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
+ }
+
+ __ addv(v0, __ T4S, v0, v2);
+ __ addv(v1, __ T4S, v1, v3);
+
+ if (multi_block) {
+ __ add(ofs, ofs, 64);
+ __ cmp(ofs, limit);
+ __ br(Assembler::LE, sha1_loop);
+ __ mov(c_rarg0, ofs); // return ofs
+ }
+
+ __ ldpd(v10, v11, Address(sp, 16));
+ __ ldpd(v8, v9, __ post(sp, 32));
+
+ __ stpq(v0, v1, state);
+
+ __ ret(lr);
+
+ return start;
+ }
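+
+ // For reference (not generated code): round_consts[] are the 64 standard
+ // SHA-256 K constants; v16..v31 hold all of them, four per register, and
+ // they are consumed in order as the rounds proceed. v8..v11 must be
+ // saved/restored because the low 64 bits of v8..v15 are callee-saved
+ // under AAPCS64. The multi_block contract mirrors the SHA-1 stub:
+ //
+ //   do { sha256_compress(state, buf); buf += 64; ofs += 64; } while (ofs <= limit);
+ //   return ofs;                            // returned in c_rarg0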
+
+#ifndef BUILTIN_SIM
+ // Safefetch stubs.
+ void generate_safefetch(const char* name, int size, address* entry,
+ address* fault_pc, address* continuation_pc) {
+ // safefetch signatures:
+ // int SafeFetch32(int* adr, int errValue);
+ // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
+ //
+ // arguments:
+ // c_rarg0 = adr
+ // c_rarg1 = errValue
+ //
+ // result:
+ // r0 = *adr or errValue
+
+ StubCodeMark mark(this, "StubRoutines", name);
+
+ // Entry point, pc or function descriptor.
+ *entry = __ pc();
+
+ // Load *adr into c_rarg1, may fault.
+ *fault_pc = __ pc();
+ switch (size) {
+ case 4:
+ // int32_t
+ __ ldrw(c_rarg1, Address(c_rarg0, 0));
+ break;
+ case 8:
+ // int64_t
+ __ ldr(c_rarg1, Address(c_rarg0, 0));
+ break;
+ default:
+ ShouldNotReachHere();
+ }
+
+ // return errValue or *adr
+ *continuation_pc = __ pc();
+ __ mov(r0, c_rarg1);
+ __ ret(lr);
+ }
+#endif
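+
+ // For reference (usage sketch): if the load at *fault_pc faults, the
+ // signal handler resumes at *continuation_pc with c_rarg1 still holding
+ // errValue, so callers simply write
+ //
+ //   int v = SafeFetch32(p, -1);   // *p if readable, otherwise -1
+ //
+ // without checking the address first.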
+
+ /**
+ * Arguments:
+ *
+ * Inputs:
+ * c_rarg0 - int crc
+ * c_rarg1 - byte* buf
+ * c_rarg2 - int length
+ *
+ * Output:
+ * r0 - int crc result
+ */
+ address generate_updateBytesCRC32() {
+ assert(UseCRC32Intrinsics, "what are we doing here?");
+
+ __ align(CodeEntryAlignment);
+ StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
+
+ address start = __ pc();
+
+ const Register crc = c_rarg0; // crc
+ const Register buf = c_rarg1; // source java byte array address
+ const Register len = c_rarg2; // length
+ const Register table0 = c_rarg3; // crc_table address
+ const Register table1 = c_rarg4;
+ const Register table2 = c_rarg5;
+ const Register table3 = c_rarg6;
+ const Register tmp3 = c_rarg7;
+
+ BLOCK_COMMENT("Entry:");
+ __ enter(); // required for proper stackwalking of RuntimeStub frame
+
+ __ kernel_crc32(crc, buf, len,
+ table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
+
+ __ leave(); // required for proper stackwalking of RuntimeStub frame
+ __ ret(lr);
+
+ return start;
+ }
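+
+ // For reference only: the heavy lifting is in MacroAssembler::kernel_crc32,
+ // which uses table1..table3 to fold several bytes per iteration; the
+ // byte-at-a-time update those tables are derived from is the classic
+ // zlib step
+ //
+ //   crc = crc_table[(crc ^ *buf++) & 0xff] ^ (crc >> 8);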
+
+#undef __
+#define __ masm->
+
+ // Continuation point for throwing of implicit exceptions that are
+ // not handled in the current activation. Fabricates an exception
+ // oop and initiates normal exception dispatching in this
+ // frame. Since we need to preserve callee-saved values (currently
+ // only for C2, but done for C1 as well) we need a callee-saved oop
+ // map and therefore have to make these stubs into RuntimeStubs
+ // rather than BufferBlobs. If the compiler needs all registers to
+ // be preserved between the fault point and the exception handler
+ // then it must assume responsibility for that in
+ // AbstractCompiler::continuation_for_implicit_null_exception or
+ // continuation_for_implicit_division_by_zero_exception. All other
+ // implicit exceptions (e.g., NullPointerException or
+ // AbstractMethodError on entry) are either at call sites or
+ // otherwise assume that stack unwinding will be initiated, so
+ // caller saved registers were assumed volatile in the compiler.
+
+ address generate_throw_exception(const char* name,
+ address runtime_entry,
+ Register arg1 = noreg,
+ Register arg2 = noreg) {
+ // Information about frame layout at time of blocking runtime call.
+ // Note that we only have to preserve callee-saved registers since
+ // the compilers are responsible for supplying a continuation point
+ // if they expect all registers to be preserved.
+ // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
+ enum layout {
+ rfp_off = 0,
+ rfp_off2,
+ return_off,
+ return_off2,
+ framesize // inclusive of return address
+ };
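+
+ // Worked out: with this layout framesize == 4 32-bit slots, i.e. a
+ // two-word frame holding only the saved rfp and return address, so the
+ // prolog's sub(sp, rfp, ((unsigned)framesize - 4) << LogBytesPerInt)
+ // subtracts zero and simply re-anchors sp at rfp after enter().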
+
+ int insts_size = 512;
+ int locs_size = 64;
+
+ CodeBuffer code(name, insts_size, locs_size);
+ OopMapSet* oop_maps = new OopMapSet();
+ MacroAssembler* masm = new MacroAssembler(&code);
+
+ address start = __ pc();
+
+ // This is an inlined and slightly modified version of call_VM
+ // which has the ability to fetch the return PC out of
+ // thread-local storage and also sets up last_Java_sp slightly
+ // differently than the real call_VM
+
+ __ enter(); // Save FP and LR before call
+
+ assert(is_even(framesize/2), "sp not 16-byte aligned");
+
+ // lr and fp are already in place
+ __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
+
+ int frame_complete = __ pc() - start;
+
+ // Set up last_Java_sp and last_Java_fp
+ address the_pc = __ pc();
+ __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
+
+ // Call runtime
+ if (arg1 != noreg) {
+ assert(arg2 != c_rarg1, "clobbered");
+ __ mov(c_rarg1, arg1);
+ }
+ if (arg2 != noreg) {
+ __ mov(c_rarg2, arg2);
+ }
+ __ mov(c_rarg0, rthread);
+ BLOCK_COMMENT("call runtime_entry");
+ __ mov(rscratch1, runtime_entry);
+ __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
+
+ // Generate oop map
+ OopMap* map = new OopMap(framesize, 0);
+
+ oop_maps->add_gc_map(the_pc - start, map);
+
+ __ reset_last_Java_frame(true, true);
+ __ maybe_isb();
+
+ __ leave();
+
+ // check for pending exceptions
+#ifdef ASSERT
+ Label L;
+ __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
+ __ cbnz(rscratch1, L);
+ __ should_not_reach_here();
+ __ bind(L);
+#endif // ASSERT
+ __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
+
+
+ // codeBlob framesize is in words (not VMRegImpl::slot_size)
+ RuntimeStub* stub =
+ RuntimeStub::new_runtime_stub(name,
+ &code,
+ frame_complete,
+ (framesize >> (LogBytesPerWord - LogBytesPerInt)),
+ oop_maps, false);
+ return stub->entry_point();
+ }
+
+ // Initialization
+ void generate_initial() {
+ // Generates the initial stubs and initializes the entry points
+
+ // entry points that exist on all platforms. Note: This is code
+ // that could be shared among different platforms - however the
+ // benefit seems to be smaller than the disadvantage of having a
+ // much more complicated generator structure. See also comment in
+ // stubRoutines.hpp.
+
+ StubRoutines::_forward_exception_entry = generate_forward_exception();
+
+ StubRoutines::_call_stub_entry =
+ generate_call_stub(StubRoutines::_call_stub_return_address);
+
+ // is referenced by megamorphic call
+ StubRoutines::_catch_exception_entry = generate_catch_exception();
+
+ // Build this early so it's available for the interpreter.
+ StubRoutines::_throw_StackOverflowError_entry =
+ generate_throw_exception("StackOverflowError throw_exception",
+ CAST_FROM_FN_PTR(address,
+ SharedRuntime::
+ throw_StackOverflowError));
+ if (UseCRC32Intrinsics) {
+ // set table address before stub generation, which uses it
+ StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
+ StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
+ }
+ }
+
+ void generate_all() {
+ // support for verify_oop (must happen after universe_init)
+ StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
+ StubRoutines::_throw_AbstractMethodError_entry =
+ generate_throw_exception("AbstractMethodError throw_exception",
+ CAST_FROM_FN_PTR(address,
+ SharedRuntime::
+ throw_AbstractMethodError));
+
+ StubRoutines::_throw_IncompatibleClassChangeError_entry =
+ generate_throw_exception("IncompatibleClassChangeError throw_exception",
+ CAST_FROM_FN_PTR(address,
+ SharedRuntime::
+ throw_IncompatibleClassChangeError));
+
+ StubRoutines::_throw_NullPointerException_at_call_entry =
+ generate_throw_exception("NullPointerException at call throw_exception",
+ CAST_FROM_FN_PTR(address,
+ SharedRuntime::
+ throw_NullPointerException_at_call));
+
+ // arraycopy stubs used by compilers
+ generate_arraycopy_stubs();
+
+#ifndef BUILTIN_SIM
+ if (UseAESIntrinsics) {
+ StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
+ StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
+ StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
+ StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
+ }
+
+ if (UseSHA1Intrinsics) {
+ StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
+ StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
+ }
+ if (UseSHA256Intrinsics) {
+ StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
+ StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
+ }
+
+ // Safefetch stubs.
+ generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
+ &StubRoutines::_safefetch32_fault_pc,
+ &StubRoutines::_safefetch32_continuation_pc);
+ generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
+ &StubRoutines::_safefetchN_fault_pc,
+ &StubRoutines::_safefetchN_continuation_pc);
+#endif
+ }
+
+ public:
+ StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
+ if (all) {
+ generate_all();
+ } else {
+ generate_initial();
+ }
+ }
+}; // end class declaration
+
+void StubGenerator_generate(CodeBuffer* code, bool all) {
+ StubGenerator g(code, all);
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "runtime/deoptimization.hpp"
+#include "runtime/frame.inline.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "runtime/thread.inline.hpp"
+
+// Implementation of the platform-specific part of StubRoutines - for
+// a description of how to extend it, see the stubRoutines.hpp file.
+
+address StubRoutines::aarch64::_get_previous_fp_entry = NULL;
+address StubRoutines::aarch64::_get_previous_sp_entry = NULL;
+
+address StubRoutines::aarch64::_f2i_fixup = NULL;
+address StubRoutines::aarch64::_f2l_fixup = NULL;
+address StubRoutines::aarch64::_d2i_fixup = NULL;
+address StubRoutines::aarch64::_d2l_fixup = NULL;
+address StubRoutines::aarch64::_float_sign_mask = NULL;
+address StubRoutines::aarch64::_float_sign_flip = NULL;
+address StubRoutines::aarch64::_double_sign_mask = NULL;
+address StubRoutines::aarch64::_double_sign_flip = NULL;
+
+/**
+ * crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h
+ */
+juint StubRoutines::aarch64::_crc_table[]
+ __attribute__ ((aligned(4096))) =
+{
+ // Table 0
+ 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
+ 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
+ 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
+ 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
+ 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
+ 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
+ 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
+ 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
+ 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
+ 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
+ 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
+ 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
+ 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
+ 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
+ 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
+ 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
+ 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
+ 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
+ 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
+ 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
+ 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
+ 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
+ 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
+ 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
+ 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
+ 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
+ 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
+ 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
+ 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
+ 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
+ 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
+ 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
+ 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
+ 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
+ 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
+ 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
+ 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
+ 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
+ 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
+ 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
+ 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
+ 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
+ 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
+ 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
+ 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
+ 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
+ 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
+ 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
+ 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
+ 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
+ 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
+ 0x2d02ef8dUL,
+
+ // Table 1
+ 0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL,
+ 0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL,
+ 0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL,
+ 0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL,
+ 0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL,
+ 0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL,
+ 0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL,
+ 0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL,
+ 0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL,
+ 0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL,
+ 0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL,
+ 0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL,
+ 0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL,
+ 0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL,
+ 0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL,
+ 0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL,
+ 0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL,
+ 0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL,
+ 0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL,
+ 0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL,
+ 0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL,
+ 0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL,
+ 0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL,
+ 0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL,
+ 0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL,
+ 0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL,
+ 0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL,
+ 0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL,
+ 0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL,
+ 0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL,
+ 0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL,
+ 0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL,
+ 0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL,
+ 0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL,
+ 0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL,
+ 0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL,
+ 0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL,
+ 0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL,
+ 0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL,
+ 0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL,
+ 0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL,
+ 0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL,
+ 0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL,
+ 0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL,
+ 0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL,
+ 0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL,
+ 0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL,
+ 0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL,
+ 0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL,
+ 0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL,
+ 0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL,
+ 0x9324fd72UL,
+
+ // Table 2
+ 0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL,
+ 0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL,
+ 0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL,
+ 0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL,
+ 0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL,
+ 0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL,
+ 0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL,
+ 0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL,
+ 0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL,
+ 0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL,
+ 0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL,
+ 0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL,
+ 0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL,
+ 0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL,
+ 0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL,
+ 0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL,
+ 0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL,
+ 0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL,
+ 0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL,
+ 0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL,
+ 0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL,
+ 0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL,
+ 0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL,
+ 0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL,
+ 0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL,
+ 0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL,
+ 0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL,
+ 0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL,
+ 0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL,
+ 0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL,
+ 0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL,
+ 0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL,
+ 0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL,
+ 0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL,
+ 0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL,
+ 0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL,
+ 0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL,
+ 0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL,
+ 0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL,
+ 0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL,
+ 0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL,
+ 0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL,
+ 0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL,
+ 0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL,
+ 0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL,
+ 0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL,
+ 0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL,
+ 0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL,
+ 0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL,
+ 0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL,
+ 0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL,
+ 0xbe9834edUL,
+
+ // Table 3
+ 0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL,
+ 0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL,
+ 0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL,
+ 0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL,
+ 0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL,
+ 0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL,
+ 0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL,
+ 0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL,
+ 0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL,
+ 0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL,
+ 0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL,
+ 0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL,
+ 0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL,
+ 0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL,
+ 0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL,
+ 0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL,
+ 0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL,
+ 0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL,
+ 0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL,
+ 0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL,
+ 0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL,
+ 0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL,
+ 0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL,
+ 0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL,
+ 0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL,
+ 0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL,
+ 0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL,
+ 0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL,
+ 0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL,
+ 0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL,
+ 0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL,
+ 0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL,
+ 0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL,
+ 0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL,
+ 0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL,
+ 0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL,
+ 0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL,
+ 0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL,
+ 0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL,
+ 0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL,
+ 0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL,
+ 0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL,
+ 0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL,
+ 0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL,
+ 0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL,
+ 0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL,
+ 0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL,
+ 0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL,
+ 0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL,
+ 0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL,
+ 0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL,
+ 0xde0506f1UL,
+ // Constants for Neon CRC32 implementation
+ // k3 = 0x78ED02D5 = x^288 mod poly - bit reversed
+ // k4 = 0xED627DAE = x^256 mod poly - bit reversed
+ 0x78ED02D5UL, 0xED627DAEUL, // k4:k3
+ 0xED78D502UL, 0x62EDAE7DUL, // byte swap
+ 0x02D578EDUL, 0x7DAEED62UL, // word swap
+ 0xD502ED78UL, 0xAE7D62EDUL, // byte swap of word swap
+};
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/stubRoutines_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_STUBROUTINES_AARCH64_HPP
+#define CPU_AARCH64_VM_STUBROUTINES_AARCH64_HPP
+
+// This file holds the platform specific parts of the StubRoutines
+// definition. See stubRoutines.hpp for a description on how to
+// extend it.
+
+// n.b. if we are notifying entry/exit to the simulator then the call
+// stub does a notify at normal return placing
+// call_stub_return_address one instruction beyond the notify. the
+// latter address is used by the stack unwind code when doing an
+// exception return.
+static bool returns_to_call_stub(address return_pc) {
+ return return_pc == _call_stub_return_address + (NotifySimulator ? -4 : 0);
+}
+
+enum platform_dependent_constants {
+ code_size1 = 19000, // simply increase if too small (assembler will crash if too small)
+ code_size2 = 22000 // simply increase if too small (assembler will crash if too small)
+};
+
+class aarch64 {
+ friend class StubGenerator;
+
+ private:
+ static address _get_previous_fp_entry;
+ static address _get_previous_sp_entry;
+
+ static address _f2i_fixup;
+ static address _f2l_fixup;
+ static address _d2i_fixup;
+ static address _d2l_fixup;
+
+ static address _float_sign_mask;
+ static address _float_sign_flip;
+ static address _double_sign_mask;
+ static address _double_sign_flip;
+
+ public:
+
+ static address get_previous_fp_entry()
+ {
+ return _get_previous_fp_entry;
+ }
+
+ static address get_previous_sp_entry()
+ {
+ return _get_previous_sp_entry;
+ }
+
+ static address f2i_fixup()
+ {
+ return _f2i_fixup;
+ }
+
+ static address f2l_fixup()
+ {
+ return _f2l_fixup;
+ }
+
+ static address d2i_fixup()
+ {
+ return _d2i_fixup;
+ }
+
+ static address d2l_fixup()
+ {
+ return _d2l_fixup;
+ }
+
+ static address float_sign_mask()
+ {
+ return _float_sign_mask;
+ }
+
+ static address float_sign_flip()
+ {
+ return _float_sign_flip;
+ }
+
+ static address double_sign_mask()
+ {
+ return _double_sign_mask;
+ }
+
+ static address double_sign_flip()
+ {
+ return _double_sign_flip;
+ }
+
+ private:
+ static juint _crc_table[];
+
+};
+
+#endif // CPU_AARCH64_VM_STUBROUTINES_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/templateInterpreterGenerator_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_TEMPLATEINTERPRETERGENERATOR_AARCH64_HPP
+#define CPU_AARCH64_VM_TEMPLATEINTERPRETERGENERATOR_AARCH64_HPP
+
+ protected:
+
+void generate_fixed_frame(bool native_call);
+
+ // address generate_asm_interpreter_entry(bool synchronized);
+
+#endif // CPU_AARCH64_VM_TEMPLATEINTERPRETERGENERATOR_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,2078 @@
+/*
+ * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "interpreter/bytecodeHistogram.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterGenerator.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "interpreter/interp_masm.hpp"
+#include "interpreter/templateTable.hpp"
+#include "interpreter/bytecodeTracer.hpp"
+#include "oops/arrayOop.hpp"
+#include "oops/methodData.hpp"
+#include "oops/method.hpp"
+#include "oops/oop.inline.hpp"
+#include "prims/jvmtiExport.hpp"
+#include "prims/jvmtiThreadState.hpp"
+#include "runtime/arguments.hpp"
+#include "runtime/deoptimization.hpp"
+#include "runtime/frame.inline.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "runtime/synchronizer.hpp"
+#include "runtime/timer.hpp"
+#include "runtime/vframeArray.hpp"
+#include "utilities/debug.hpp"
+#include <sys/types.h>
+
+#ifndef PRODUCT
+#include "oops/method.hpp"
+#endif // !PRODUCT
+
+#ifdef BUILTIN_SIM
+#include "../../../../../../simulator/simulator.hpp"
+#endif
+
+#define __ _masm->
+
+#ifndef CC_INTERP
+
+//-----------------------------------------------------------------------------
+
+extern "C" void entry(CodeBuffer*);
+
+//-----------------------------------------------------------------------------
+
+address TemplateInterpreterGenerator::generate_StackOverflowError_handler() {
+ address entry = __ pc();
+
+#ifdef ASSERT
+ {
+ Label L;
+ __ ldr(rscratch1, Address(rfp,
+ frame::interpreter_frame_monitor_block_top_offset *
+ wordSize));
+ __ mov(rscratch2, sp);
+ __ cmp(rscratch1, rscratch2); // maximal rsp for current rfp (stack
+ // grows negative)
+ __ br(Assembler::HS, L); // check if frame is complete
+ __ stop ("interpreter frame not set up");
+ __ bind(L);
+ }
+#endif // ASSERT
+ // Restore bcp under the assumption that the current frame is still
+ // interpreted
+ __ restore_bcp();
+
+ // expression stack must be empty before entering the VM if an
+ // exception happened
+ __ empty_expression_stack();
+ // throw exception
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::throw_StackOverflowError));
+ return entry;
+}
+
+address TemplateInterpreterGenerator::generate_ArrayIndexOutOfBounds_handler(
+ const char* name) {
+ address entry = __ pc();
+ // expression stack must be empty before entering the VM if an
+ // exception happened
+ __ empty_expression_stack();
+ // setup parameters
+ // ??? convention: expect aberrant index in register r1
+ __ movw(c_rarg2, r1);
+ __ mov(c_rarg1, (address)name);
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::
+ throw_ArrayIndexOutOfBoundsException),
+ c_rarg1, c_rarg2);
+ return entry;
+}
+
+address TemplateInterpreterGenerator::generate_ClassCastException_handler() {
+ address entry = __ pc();
+
+ // object is at TOS
+ __ pop(c_rarg1);
+
+ // expression stack must be empty before entering the VM if an
+ // exception happened
+ __ empty_expression_stack();
+
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::
+ throw_ClassCastException),
+ c_rarg1);
+ return entry;
+}
+
+address TemplateInterpreterGenerator::generate_exception_handler_common(
+ const char* name, const char* message, bool pass_oop) {
+ assert(!pass_oop || message == NULL, "either oop or message but not both");
+ address entry = __ pc();
+ if (pass_oop) {
+ // object is at TOS
+ __ pop(c_rarg2);
+ }
+ // expression stack must be empty before entering the VM if an
+ // exception happened
+ __ empty_expression_stack();
+ // setup parameters
+ __ lea(c_rarg1, Address((address)name));
+ if (pass_oop) {
+ __ call_VM(r0, CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::
+ create_klass_exception),
+ c_rarg1, c_rarg2);
+ } else {
+ // kind of lame ExternalAddress can't take NULL because
+ // external_word_Relocation will assert.
+ if (message != NULL) {
+ __ lea(c_rarg2, Address((address)message));
+ } else {
+ __ mov(c_rarg2, NULL_WORD);
+ }
+ __ call_VM(r0,
+ CAST_FROM_FN_PTR(address, InterpreterRuntime::create_exception),
+ c_rarg1, c_rarg2);
+ }
+ // throw exception
+ __ b(address(Interpreter::throw_exception_entry()));
+ return entry;
+}
+
+address TemplateInterpreterGenerator::generate_continuation_for(TosState state) {
+ address entry = __ pc();
+ // NULL last_sp until next java call
+ __ str(zr, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
+ __ dispatch_next(state);
+ return entry;
+}
+
+address TemplateInterpreterGenerator::generate_return_entry_for(TosState state, int step, size_t index_size) {
+ address entry = __ pc();
+
+ // Restore stack bottom in case i2c adjusted stack
+ __ ldr(esp, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
+ // and NULL it as marker that esp is now tos until next java call
+ __ str(zr, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
+ __ restore_bcp();
+ __ restore_locals();
+ __ restore_constant_pool_cache();
+ __ get_method(rmethod);
+
+ // Pop N words from the stack
+ __ get_cache_and_index_at_bcp(r1, r2, 1, index_size);
+ __ ldr(r1, Address(r1, ConstantPoolCache::base_offset() + ConstantPoolCacheEntry::flags_offset()));
+ __ andr(r1, r1, ConstantPoolCacheEntry::parameter_size_mask);
+
+ __ add(esp, esp, r1, Assembler::LSL, 3);
+
+ // Restore machine SP
+ __ ldr(rscratch1, Address(rmethod, Method::const_offset()));
+ __ ldrh(rscratch1, Address(rscratch1, ConstMethod::max_stack_offset()));
+ __ add(rscratch1, rscratch1, frame::interpreter_frame_monitor_size() + 2);
+ __ ldr(rscratch2,
+ Address(rfp, frame::interpreter_frame_initial_sp_offset * wordSize));
+ __ sub(rscratch1, rscratch2, rscratch1, ext::uxtw, 3);
+ __ andr(sp, rscratch1, -16);
+
+#ifndef PRODUCT
+ // tell the simulator that the method has been reentered
+ if (NotifySimulator) {
+ __ notify(Assembler::method_reentry);
+ }
+#endif
+ __ get_dispatch();
+ __ dispatch_next(state, step);
+
+ return entry;
+}
+
+address TemplateInterpreterGenerator::generate_deopt_entry_for(TosState state,
+ int step) {
+ address entry = __ pc();
+ __ restore_bcp();
+ __ restore_locals();
+ __ restore_constant_pool_cache();
+ __ get_method(rmethod);
+
+ // handle exceptions
+ {
+ Label L;
+ __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
+ __ cbz(rscratch1, L);
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::throw_pending_exception));
+ __ should_not_reach_here();
+ __ bind(L);
+ }
+
+ __ get_dispatch();
+
+ // Calculate stack limit
+ __ ldr(rscratch1, Address(rmethod, Method::const_offset()));
+ __ ldrh(rscratch1, Address(rscratch1, ConstMethod::max_stack_offset()));
+ __ add(rscratch1, rscratch1, frame::interpreter_frame_monitor_size() + 2);
+ __ ldr(rscratch2,
+ Address(rfp, frame::interpreter_frame_initial_sp_offset * wordSize));
+ __ sub(rscratch1, rscratch2, rscratch1, ext::uxtx, 3);
+ __ andr(sp, rscratch1, -16);
+
+ // Restore expression stack pointer
+ __ ldr(esp, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
+ // NULL last_sp until next java call
+ __ str(zr, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
+
+ __ dispatch_next(state, step);
+ return entry;
+}
+
+
+int AbstractInterpreter::BasicType_as_index(BasicType type) {
+ int i = 0;
+ switch (type) {
+ case T_BOOLEAN: i = 0; break;
+ case T_CHAR : i = 1; break;
+ case T_BYTE : i = 2; break;
+ case T_SHORT : i = 3; break;
+ case T_INT : i = 4; break;
+ case T_LONG : i = 5; break;
+ case T_VOID : i = 6; break;
+ case T_FLOAT : i = 7; break;
+ case T_DOUBLE : i = 8; break;
+ case T_OBJECT : i = 9; break;
+ case T_ARRAY : i = 9; break;
+ default : ShouldNotReachHere();
+ }
+ assert(0 <= i && i < AbstractInterpreter::number_of_result_handlers,
+ "index out of bounds");
+ return i;
+}
+
+
+address TemplateInterpreterGenerator::generate_result_handler_for(
+ BasicType type) {
+ address entry = __ pc();
+ switch (type) {
+ case T_BOOLEAN: __ uxtb(r0, r0); break;
+ case T_CHAR : __ uxth(r0, r0); break;
+ case T_BYTE : __ sxtb(r0, r0); break;
+ case T_SHORT : __ sxth(r0, r0); break;
+ case T_INT : __ uxtw(r0, r0); break; // FIXME: We almost certainly don't need this
+ case T_LONG : /* nothing to do */ break;
+ case T_VOID : /* nothing to do */ break;
+ case T_FLOAT : /* nothing to do */ break;
+ case T_DOUBLE : /* nothing to do */ break;
+ case T_OBJECT :
+ // retrieve result from frame
+ __ ldr(r0, Address(rfp, frame::interpreter_frame_oop_temp_offset*wordSize));
+ // and verify it
+ __ verify_oop(r0);
+ break;
+ default : ShouldNotReachHere();
+ }
+ __ ret(lr); // return from result handler
+ return entry;
+}
+
+address TemplateInterpreterGenerator::generate_safept_entry_for(
+ TosState state,
+ address runtime_entry) {
+ address entry = __ pc();
+ __ push(state);
+ __ call_VM(noreg, runtime_entry);
+ __ membar(Assembler::AnyAny);
+ __ dispatch_via(vtos, Interpreter::_normal_table.table_for(vtos));
+ return entry;
+}
+
+// Helpers for commoning out cases in the various types of method entries.
+//
+
+
+// increment invocation count & check for overflow
+//
+// Note: checking for negative value instead of overflow
+// so we have a 'sticky' overflow test
+//
+// rmethod: method
+//
+void InterpreterGenerator::generate_counter_incr(
+ Label* overflow,
+ Label* profile_method,
+ Label* profile_method_continue) {
+ Label done;
+ // Note: In tiered we increment either counters in Method* or in MDO depending if we're profiling or not.
+ if (TieredCompilation) {
+ int increment = InvocationCounter::count_increment;
+ int mask = ((1 << Tier0InvokeNotifyFreqLog) - 1) << InvocationCounter::count_shift;
+ Label no_mdo;
+ if (ProfileInterpreter) {
+ // Are we profiling?
+ __ ldr(r0, Address(rmethod, Method::method_data_offset()));
+ __ cbz(r0, no_mdo);
+ // Increment counter in the MDO
+ const Address mdo_invocation_counter(r0, in_bytes(MethodData::invocation_counter_offset()) +
+ in_bytes(InvocationCounter::counter_offset()));
+ __ increment_mask_and_jump(mdo_invocation_counter, increment, mask, rscratch1, false, Assembler::EQ, overflow);
+ __ b(done);
+ }
+ __ bind(no_mdo);
+ // Increment counter in MethodCounters
+ const Address invocation_counter(rscratch2,
+ MethodCounters::invocation_counter_offset() +
+ InvocationCounter::counter_offset());
+ __ get_method_counters(rmethod, rscratch2, done);
+ __ increment_mask_and_jump(invocation_counter, increment, mask, rscratch1, false, Assembler::EQ, overflow);
+ __ bind(done);
+ } else {
+ const Address backedge_counter(rscratch2,
+ MethodCounters::backedge_counter_offset() +
+ InvocationCounter::counter_offset());
+ const Address invocation_counter(rscratch2,
+ MethodCounters::invocation_counter_offset() +
+ InvocationCounter::counter_offset());
+
+ __ get_method_counters(rmethod, rscratch2, done);
+
+ if (ProfileInterpreter) { // %%% Merge this into MethodData*
+ __ ldrw(r1, Address(rscratch2, MethodCounters::interpreter_invocation_counter_offset()));
+ __ addw(r1, r1, 1);
+ __ strw(r1, Address(rscratch2, MethodCounters::interpreter_invocation_counter_offset()));
+ }
+ // Update standard invocation counters
+ __ ldrw(r1, invocation_counter);
+ __ ldrw(r0, backedge_counter);
+
+ __ addw(r1, r1, InvocationCounter::count_increment);
+ __ andw(r0, r0, InvocationCounter::count_mask_value);
+
+ __ strw(r1, invocation_counter);
+ __ addw(r0, r0, r1); // add both counters
+
+ // profile_method is non-null only for interpreted methods, so
+ // profile_method != NULL is equivalent to !native_call
+
+ if (ProfileInterpreter && profile_method != NULL) {
+ // Test to see if we should create a method data oop
+ unsigned long offset;
+ __ adrp(rscratch2, ExternalAddress((address)&InvocationCounter::InterpreterProfileLimit),
+ offset);
+ __ ldrw(rscratch2, Address(rscratch2, offset));
+ __ cmp(r0, rscratch2);
+ __ br(Assembler::LT, *profile_method_continue);
+
+ // if no method data exists, go to profile_method
+ __ test_method_data_pointer(r0, *profile_method);
+ }
+
+ {
+ unsigned long offset;
+ __ adrp(rscratch2,
+ ExternalAddress((address)&InvocationCounter::InterpreterInvocationLimit),
+ offset);
+ __ ldrw(rscratch2, Address(rscratch2, offset));
+ __ cmpw(r0, rscratch2);
+ __ br(Assembler::HS, *overflow);
+ }
+ __ bind(done);
+ }
+}
+
+void InterpreterGenerator::generate_counter_overflow(Label* do_continue) {
+
+ // Asm interpreter on entry
+ // On return (i.e. jump to entry_point) [ back to invocation of interpreter ]
+ // Everything as it was on entry
+
+ // InterpreterRuntime::frequency_counter_overflow takes two
+ // arguments, the first (thread) is passed by call_VM, the second
+ // indicates if the counter overflow occurs at a backwards branch
+ // (NULL bcp). We pass zero for it. The call returns the address
+ // of the verified entry point for the method or NULL if the
+ // compilation did not complete (either went background or bailed
+ // out).
+ __ mov(c_rarg1, 0);
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::frequency_counter_overflow),
+ c_rarg1);
+
+ __ b(*do_continue);
+}
+
+// See if we've got enough room on the stack for locals plus overhead.
+// The expression stack grows down incrementally, so the normal guard
+// page mechanism will work for that.
+//
+// NOTE: The additional locals are always pushed as well (this wasn't
+// obvious in generate_method_entry), so the guard should work for
+// them too.
+//
+// Args:
+// r3: number of additional locals this frame needs (what we must check)
+// rmethod: Method*
+//
+// Kills:
+// r0
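+//
+// A rough pseudo-code sketch of the check generated below (names
+// follow the constants defined in the body; this is a summary of the
+// generated code, not additional logic):
+//
+//   if (additional locals fit within one page)      // SUBS fast path
+//     goto after_frame_check;
+//   needed = overhead_size + additional_locals * stackElementSize;
+//   limit  = (stack_base - stack_size)              // stack limit
+//          + max_pages * page_size * 2              // guard-zone slack
+//          + needed;
+//   if (sp <= limit)
+//     throw StackOverflowError;                     // via shared stub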
+void InterpreterGenerator::generate_stack_overflow_check(void) {
+
+ // monitor entry size: see picture of stack set
+ // (generate_method_entry) and frame_aarch64.hpp
+ const int entry_size = frame::interpreter_frame_monitor_size() * wordSize;
+
+ // total overhead size: entry_size + (saved rfp through expr stack
+ // bottom). be sure to change this if you add/subtract anything
+ // to/from the overhead area
+ const int overhead_size =
+ -(frame::interpreter_frame_initial_sp_offset * wordSize) + entry_size;
+
+ const int page_size = os::vm_page_size();
+
+ Label after_frame_check;
+
+ // see if the frame is greater than one page in size. If so,
+ // then we need to verify there is enough stack space remaining
+ // for the additional locals.
+ //
+ // Note that we use SUBS rather than CMP here because the immediate
+ // field of this instruction may overflow. SUBS can cope with this
+ // because it is a macro that will expand to some number of MOV
+ // instructions and a register operation.
+ __ subs(rscratch1, r3, (page_size - overhead_size) / Interpreter::stackElementSize);
+ __ br(Assembler::LS, after_frame_check);
+
+ // compute rsp as if this were going to be the last frame on
+ // the stack before the red zone
+
+ const Address stack_base(rthread, Thread::stack_base_offset());
+ const Address stack_size(rthread, Thread::stack_size_offset());
+
+ // locals + overhead, in bytes
+ __ mov(r0, overhead_size);
+ __ add(r0, r0, r3, Assembler::LSL, Interpreter::logStackElementSize); // 2 slots per parameter.
+
+ __ ldr(rscratch1, stack_base);
+ __ ldr(rscratch2, stack_size);
+
+#ifdef ASSERT
+ Label stack_base_okay, stack_size_okay;
+ // verify that thread stack base is non-zero
+ __ cbnz(rscratch1, stack_base_okay);
+ __ stop("stack base is zero");
+ __ bind(stack_base_okay);
+ // verify that thread stack size is non-zero
+ __ cbnz(rscratch2, stack_size_okay);
+ __ stop("stack size is zero");
+ __ bind(stack_size_okay);
+#endif
+
+ // Add stack base to locals and subtract stack size
+ __ sub(rscratch1, rscratch1, rscratch2); // Stack limit
+ __ add(r0, r0, rscratch1);
+
+ // Use the maximum number of pages we might bang.
+ const int max_pages = StackShadowPages > (StackRedPages+StackYellowPages) ? StackShadowPages :
+ (StackRedPages+StackYellowPages);
+
+ // add in the red and yellow zone sizes
+ __ add(r0, r0, max_pages * page_size * 2);
+
+ // check against the current stack bottom
+ __ cmp(sp, r0);
+ __ br(Assembler::HI, after_frame_check);
+
+ // Remove the incoming args, peeling the machine SP back to where it
+ // was in the caller. This is not strictly necessary, but unless we
+ // do so the stack frame may have a garbage FP; this ensures a
+ // correct call stack that we can always unwind. The ANDR should be
+ // unnecessary because the sender SP in r13 is always aligned, but
+ // it doesn't hurt.
+ __ andr(sp, r13, -16);
+
+ // Note: the restored frame is not necessarily interpreted.
+ // Use the shared runtime version of the StackOverflowError.
+ assert(StubRoutines::throw_StackOverflowError_entry() != NULL, "stub not yet generated");
+ __ far_jump(RuntimeAddress(StubRoutines::throw_StackOverflowError_entry()));
+
+ // all done with frame size check
+ __ bind(after_frame_check);
+}
+
+// Allocate monitor and lock method (asm interpreter)
+//
+// Args:
+// rmethod: Method*
+// rlocals: locals
+//
+// Kills:
+// r0
+// c_rarg0, c_rarg1, c_rarg2, c_rarg3, ...(param regs)
+// rscratch1, rscratch2 (scratch regs)
+void InterpreterGenerator::lock_method(void) {
+ // synchronize method
+ const Address access_flags(rmethod, Method::access_flags_offset());
+ const Address monitor_block_top(
+ rfp,
+ frame::interpreter_frame_monitor_block_top_offset * wordSize);
+ const int entry_size = frame::interpreter_frame_monitor_size() * wordSize;
+
+#ifdef ASSERT
+ {
+ Label L;
+ __ ldrw(r0, access_flags);
+ __ tst(r0, JVM_ACC_SYNCHRONIZED);
+ __ br(Assembler::NE, L);
+ __ stop("method doesn't need synchronization");
+ __ bind(L);
+ }
+#endif // ASSERT
+
+ // get synchronization object
+ {
+ const int mirror_offset = in_bytes(Klass::java_mirror_offset());
+ Label done;
+ __ ldrw(r0, access_flags);
+ __ tst(r0, JVM_ACC_STATIC);
+ // get receiver (assume this is frequent case)
+ __ ldr(r0, Address(rlocals, Interpreter::local_offset_in_bytes(0)));
+ __ br(Assembler::EQ, done);
+ __ ldr(r0, Address(rmethod, Method::const_offset()));
+ __ ldr(r0, Address(r0, ConstMethod::constants_offset()));
+ __ ldr(r0, Address(r0,
+ ConstantPool::pool_holder_offset_in_bytes()));
+ __ ldr(r0, Address(r0, mirror_offset));
+
+#ifdef ASSERT
+ {
+ Label L;
+ __ cbnz(r0, L);
+ __ stop("synchronization object is NULL");
+ __ bind(L);
+ }
+#endif // ASSERT
+
+ __ bind(done);
+ }
+
+ // add space for monitor & lock
+ __ sub(sp, sp, entry_size); // add space for a monitor entry
+ __ sub(esp, esp, entry_size);
+ __ mov(rscratch1, esp);
+ __ str(rscratch1, monitor_block_top); // set new monitor block top
+ // store object
+ __ str(r0, Address(esp, BasicObjectLock::obj_offset_in_bytes()));
+ __ mov(c_rarg1, esp); // object address
+ __ lock_object(c_rarg1);
+}
+
+// Generate a fixed interpreter frame. This is an identical setup for
+// interpreted and native methods, hence the shared code.
+//
+// Args:
+// lr: return address
+// rmethod: Method*
+// rlocals: pointer to locals
+// rcpool: cp cache
+// stack_pointer: previous sp
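+//
+// A rough sketch of the fixed frame laid out below, inferred from the
+// stp/str sequence that follows (slot indices are word offsets from
+// the new sp; this is a summary, not an authoritative layout diagram):
+//
+//   sp[11..10] two zeroed slots      (native frames only)
+//   sp[9]      lr                    (return address)
+//   sp[8]      saved rfp             (the new rfp points here)
+//   sp[7]      r13                   (sender sp)
+//   sp[6]      zr                    (last_sp, initially NULL)
+//   sp[5]      rmethod               (Method*)
+//   sp[4]      mdp or zr             (method data pointer, if any)
+//   sp[3]      rcpool                (constant pool cache)
+//   sp[2]      rlocals               (pointer to locals)
+//   sp[1]      rbcp or zr            (bcp; zero for native frames)
+//   sp[0]      esp                   (expression stack pointer)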
+void TemplateInterpreterGenerator::generate_fixed_frame(bool native_call) {
+ // initialize fixed part of activation frame
+ if (native_call) {
+ __ sub(esp, sp, 12 * wordSize);
+ __ mov(rbcp, zr);
+ __ stp(esp, zr, Address(__ pre(sp, -12 * wordSize)));
+ // add 2 zero-initialized slots for native calls
+ __ stp(zr, zr, Address(sp, 10 * wordSize));
+ } else {
+ __ sub(esp, sp, 10 * wordSize);
+ __ ldr(rscratch1, Address(rmethod, Method::const_offset())); // get ConstMethod
+ __ add(rbcp, rscratch1, in_bytes(ConstMethod::codes_offset())); // get codebase
+ __ stp(esp, rbcp, Address(__ pre(sp, -10 * wordSize)));
+ }
+
+ if (ProfileInterpreter) {
+ Label method_data_continue;
+ __ ldr(rscratch1, Address(rmethod, Method::method_data_offset()));
+ __ cbz(rscratch1, method_data_continue);
+ __ lea(rscratch1, Address(rscratch1, in_bytes(MethodData::data_offset())));
+ __ bind(method_data_continue);
+ __ stp(rscratch1, rmethod, Address(sp, 4 * wordSize)); // save Method* and mdp (method data pointer)
+ } else {
+ __ stp(zr, rmethod, Address(sp, 4 * wordSize)); // save Method* (no mdp)
+ }
+
+ __ ldr(rcpool, Address(rmethod, Method::const_offset()));
+ __ ldr(rcpool, Address(rcpool, ConstMethod::constants_offset()));
+ __ ldr(rcpool, Address(rcpool, ConstantPool::cache_offset_in_bytes()));
+ __ stp(rlocals, rcpool, Address(sp, 2 * wordSize));
+
+ __ stp(rfp, lr, Address(sp, 8 * wordSize));
+ __ lea(rfp, Address(sp, 8 * wordSize));
+
+ // set sender sp
+ // leave last_sp as null
+ __ stp(zr, r13, Address(sp, 6 * wordSize));
+
+ // Move SP out of the way
+ if (! native_call) {
+ __ ldr(rscratch1, Address(rmethod, Method::const_offset()));
+ __ ldrh(rscratch1, Address(rscratch1, ConstMethod::max_stack_offset()));
+ __ add(rscratch1, rscratch1, frame::interpreter_frame_monitor_size() + 2);
+ __ sub(rscratch1, sp, rscratch1, ext::uxtw, 3);
+ __ andr(sp, rscratch1, -16);
+ }
+}
+
+// End of helpers
+
+// Various method entries
+//------------------------------------------------------------------------------------------------------------------------
+//
+//
+
+// Method entry for java.lang.ref.Reference.get.
+address InterpreterGenerator::generate_Reference_get_entry(void) {
+#if INCLUDE_ALL_GCS
+ // Code: _aload_0, _getfield, _areturn
+ // parameter size = 1
+ //
+ // The code that gets generated by this routine is split into 2 parts:
+ // 1. The "intrinsified" code for G1 (or any SATB based GC),
+ // 2. The slow path - which is an expansion of the regular method entry.
+ //
+ // Notes:
+ // * In the G1 code we do not check whether we need to block for
+ // a safepoint. If G1 is enabled then we must execute the specialized
+ // code for Reference.get (except when the Reference object is null)
+ // so that we can log the value in the referent field with an SATB
+ // update buffer.
+ // If the code for the getfield template is modified so that the
+ // G1 pre-barrier code is executed when the current method is
+ // Reference.get() then going through the normal method entry
+ // will be fine.
+ // * The G1 code can, however, check the receiver object (the instance
+ // of java.lang.Reference) and jump to the slow path if null. If the
+ // Reference object is null then we obviously cannot fetch the referent
+ // and so we don't need to call the G1 pre-barrier. Thus we can use the
+ // regular method entry code to generate the NPE.
+ //
+ // This code is based on generate_accessor_entry.
+ //
+ // rmethod: Method*
+ // r13: senderSP must be preserved for the slow path; set SP to it on the fast path
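+ //
+ // Roughly, the fast path generated below behaves like the following
+ // pseudo-code (a summary of the assembly, not a literal translation):
+ //
+ //   if (this == NULL) goto slow_path;   // cbz on local 0
+ //   referent = this.referent;           // load_heap_oop
+ //   g1_pre_barrier(referent);           // SATB logging via pre-barrier
+ //   return referent;                    // restore caller SP, ret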
+
+ address entry = __ pc();
+
+ const int referent_offset = java_lang_ref_Reference::referent_offset;
+ guarantee(referent_offset > 0, "referent offset not initialized");
+
+ if (UseG1GC) {
+ Label slow_path;
+ const Register local_0 = c_rarg0;
+ // Check if local 0 != NULL
+ // If the receiver is null then it is OK to jump to the slow path.
+ __ ldr(local_0, Address(esp, 0));
+ __ cbz(local_0, slow_path);
+
+
+ // Load the value of the referent field.
+ const Address field_address(local_0, referent_offset);
+ __ load_heap_oop(local_0, field_address);
+
+ // Generate the G1 pre-barrier code to log the value of
+ // the referent field in an SATB buffer.
+ __ enter(); // g1_write may call runtime
+ __ g1_write_barrier_pre(noreg /* obj */,
+ local_0 /* pre_val */,
+ rthread /* thread */,
+ rscratch2 /* tmp */,
+ true /* tosca_live */,
+ true /* expand_call */);
+ __ leave();
+ // areturn
+ __ andr(sp, r13, -16); // done with stack
+ __ ret(lr);
+
+ // generate a vanilla interpreter entry as the slow path
+ __ bind(slow_path);
+ (void) generate_normal_entry(false);
+
+ return entry;
+ }
+#endif // INCLUDE_ALL_GCS
+
+ // If G1 is not enabled then attempt to go through the accessor entry point
+ // Reference.get is an accessor
+ return generate_accessor_entry();
+}
+
+/**
+ * Method entry for static native methods:
+ * int java.util.zip.CRC32.update(int crc, int b)
+ */
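+// A minimal C-level sketch of what the intrinsic below computes,
+// assuming update_byte_crc32 implements the standard table-driven
+// CRC-32 byte step over the table at StubRoutines::crc_table_addr()
+// (the names here are illustrative only):
+//
+//   uint32_t crc32_update(uint32_t crc, uint8_t b, const uint32_t* tbl) {
+//     crc = ~crc;                                 // ornw(crc, zr, crc)
+//     crc = tbl[(crc ^ b) & 0xff] ^ (crc >> 8);   // update_byte_crc32
+//     return ~crc;                                // ornw(crc, zr, crc)
+//   }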
+address InterpreterGenerator::generate_CRC32_update_entry() {
+ if (UseCRC32Intrinsics) {
+ address entry = __ pc();
+
+ // rmethod: Method*
+ // r13: senderSP must be preserved for slow path
+ // esp: args
+
+ Label slow_path;
+ // If we need a safepoint check, generate full interpreter entry.
+ ExternalAddress state(SafepointSynchronize::address_of_state());
+ unsigned long offset;
+ __ adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
+ __ ldrw(rscratch1, Address(rscratch1, offset));
+ assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
+ __ cbnz(rscratch1, slow_path);
+
+ // We don't generate a local frame and don't align the stack because
+ // we call stub code and there is no safepoint on this path.
+
+ // Load parameters
+ const Register crc = c_rarg0; // crc
+ const Register val = c_rarg1; // source java byte value
+ const Register tbl = c_rarg2; // scratch
+
+ // Arguments are reversed on java expression stack
+ __ ldrw(val, Address(esp, 0)); // byte value
+ __ ldrw(crc, Address(esp, wordSize)); // Initial CRC
+
+ __ adrp(tbl, ExternalAddress(StubRoutines::crc_table_addr()), offset);
+ __ add(tbl, tbl, offset);
+
+ __ ornw(crc, zr, crc); // ~crc
+ __ update_byte_crc32(crc, val, tbl);
+ __ ornw(crc, zr, crc); // ~crc
+
+ // result in c_rarg0
+
+ __ andr(sp, r13, -16);
+ __ ret(lr);
+
+ // generate a vanilla native entry as the slow path
+ __ bind(slow_path);
+
+ (void) generate_native_entry(false);
+
+ return entry;
+ }
+ return generate_native_entry(false);
+}
+
+/**
+ * Method entry for static native methods:
+ * int java.util.zip.CRC32.updateBytes(int crc, byte[] b, int off, int len)
+ * int java.util.zip.CRC32.updateByteBuffer(int crc, long buf, int off, int len)
+ */
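+// The arguments sit reversed on the Java expression stack; the word
+// offsets from esp used by the loads below are, roughly:
+//   updateBytes:      len = 0, off = 1, byte[] b = 2, crc = 3
+//   updateByteBuffer: len = 0, off = 1, long buf = 2 (two slots), crc = 4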
+address InterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) {
+ if (UseCRC32Intrinsics) {
+ address entry = __ pc();
+
+ // rmethod: Method*
+ // r13: senderSP must be preserved for slow path
+
+ Label slow_path;
+ // If we need a safepoint check, generate full interpreter entry.
+ ExternalAddress state(SafepointSynchronize::address_of_state());
+ unsigned long offset;
+ __ adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
+ __ ldrw(rscratch1, Address(rscratch1, offset));
+ assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
+ __ cbnz(rscratch1, slow_path);
+
+ // We don't generate a local frame and don't align the stack because
+ // we call stub code and there is no safepoint on this path.
+
+ // Load parameters
+ const Register crc = c_rarg0; // crc
+ const Register buf = c_rarg1; // source java byte array address
+ const Register len = c_rarg2; // length
+ const Register off = len; // offset (shares a register with 'len'; the two are never live at the same time)
+
+ // Arguments are reversed on java expression stack
+ // Calculate address of start element
+ if (kind == Interpreter::java_util_zip_CRC32_updateByteBuffer) {
+ __ ldr(buf, Address(esp, 2*wordSize)); // long buf
+ __ ldrw(off, Address(esp, wordSize)); // offset
+ __ add(buf, buf, off); // + offset
+ __ ldrw(crc, Address(esp, 4*wordSize)); // Initial CRC
+ } else {
+ __ ldr(buf, Address(esp, 2*wordSize)); // byte[] array
+ __ add(buf, buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
+ __ ldrw(off, Address(esp, wordSize)); // offset
+ __ add(buf, buf, off); // + offset
+ __ ldrw(crc, Address(esp, 3*wordSize)); // Initial CRC
+ }
+ // Can now load 'len' since we're finished with 'off'
+ __ ldrw(len, Address(esp, 0x0)); // Length
+
+ __ andr(sp, r13, -16); // Restore the caller's SP
+
+ // We are frameless so we can just jump to the stub.
+ __ b(CAST_FROM_FN_PTR(address, StubRoutines::updateBytesCRC32()));
+
+ // generate a vanilla native entry as the slow path
+ __ bind(slow_path);
+
+ (void) generate_native_entry(false);
+
+ return entry;
+ }
+ return generate_native_entry(false);
+}
+
+void InterpreterGenerator::bang_stack_shadow_pages(bool native_call) {
+ // Bang each page in the shadow zone. We can't assume it's been done for
+ // an interpreter frame with greater than a page of locals, so each page
+ // needs to be checked. Only true for non-native.
+ if (UseStackBanging) {
+ const int start_page = native_call ? StackShadowPages : 1;
+ const int page_size = os::vm_page_size();
+ for (int pages = start_page; pages <= StackShadowPages ; pages++) {
+ __ sub(rscratch2, sp, pages*page_size);
+ __ ldr(zr, Address(rscratch2));
+ }
+ }
+}
+
+
+// Interpreter stub for calling a native method. (asm interpreter)
+// This sets up a somewhat different looking stack for calling the
+// native method than the typical interpreter frame setup.
+address InterpreterGenerator::generate_native_entry(bool synchronized) {
+ // determine code generation flags
+ bool inc_counter = UseCompiler || CountCompiledCalls;
+
+ // r1: Method*
+ // rscratch1: sender sp
+
+ address entry_point = __ pc();
+
+ const Address constMethod (rmethod, Method::const_offset());
+ const Address access_flags (rmethod, Method::access_flags_offset());
+ const Address size_of_parameters(r2, ConstMethod::
+ size_of_parameters_offset());
+
+ // get parameter size (always needed)
+ __ ldr(r2, constMethod);
+ __ load_unsigned_short(r2, size_of_parameters);
+
+ // native calls don't need the stack size check since they have no
+ // expression stack, the arguments are already on the stack, and
+ // we only add a handful of words to the stack
+
+ // rmethod: Method*
+ // r2: size of parameters
+ // rscratch1: sender sp
+
+ // for natives the size of locals is zero
+
+ // compute beginning of parameters (rlocals)
+ __ add(rlocals, esp, r2, ext::uxtx, 3);
+ __ add(rlocals, rlocals, -wordSize);
+
+ // Pull SP back to minimum size: this avoids holes in the stack
+ __ andr(sp, esp, -16);
+
+ // initialize fixed part of activation frame
+ generate_fixed_frame(true);
+#ifndef PRODUCT
+ // tell the simulator that a method has been entered
+ if (NotifySimulator) {
+ __ notify(Assembler::method_entry);
+ }
+#endif
+
+ // make sure method is native & not abstract
+#ifdef ASSERT
+ __ ldrw(r0, access_flags);
+ {
+ Label L;
+ __ tst(r0, JVM_ACC_NATIVE);
+ __ br(Assembler::NE, L);
+ __ stop("tried to execute non-native method as native");
+ __ bind(L);
+ }
+ {
+ Label L;
+ __ tst(r0, JVM_ACC_ABSTRACT);
+ __ br(Assembler::EQ, L);
+ __ stop("tried to execute abstract method in interpreter");
+ __ bind(L);
+ }
+#endif
+
+ // Since at this point in the method invocation the exception
+ // handler would try to exit the monitor of synchronized methods
+ // which hasn't been entered yet, we set the thread local variable
+ // _do_not_unlock_if_synchronized to true. The remove_activation
+ // will check this flag.
+
+ const Address do_not_unlock_if_synchronized(rthread,
+ in_bytes(JavaThread::do_not_unlock_if_synchronized_offset()));
+ __ mov(rscratch2, true);
+ __ strb(rscratch2, do_not_unlock_if_synchronized);
+
+ // increment invocation count & check for overflow
+ Label invocation_counter_overflow;
+ if (inc_counter) {
+ generate_counter_incr(&invocation_counter_overflow, NULL, NULL);
+ }
+
+ Label continue_after_compile;
+ __ bind(continue_after_compile);
+
+ bang_stack_shadow_pages(true);
+
+ // reset the _do_not_unlock_if_synchronized flag
+ __ strb(zr, do_not_unlock_if_synchronized);
+
+ // check for synchronized methods
+ // Must happen AFTER invocation_counter check and stack overflow check,
+ // so the method is not locked if the counter overflows.
+ if (synchronized) {
+ lock_method();
+ } else {
+ // no synchronization necessary
+#ifdef ASSERT
+ {
+ Label L;
+ __ ldrw(r0, access_flags);
+ __ tst(r0, JVM_ACC_SYNCHRONIZED);
+ __ br(Assembler::EQ, L);
+ __ stop("method needs synchronization");
+ __ bind(L);
+ }
+#endif
+ }
+
+ // start execution
+#ifdef ASSERT
+ {
+ Label L;
+ const Address monitor_block_top(rfp,
+ frame::interpreter_frame_monitor_block_top_offset * wordSize);
+ __ ldr(rscratch1, monitor_block_top);
+ __ cmp(esp, rscratch1);
+ __ br(Assembler::EQ, L);
+ __ stop("broken stack frame setup in interpreter");
+ __ bind(L);
+ }
+#endif
+
+ // jvmti support
+ __ notify_method_entry();
+
+ // work registers
+ const Register t = r17;
+ const Register result_handler = r19;
+
+ // allocate space for parameters
+ __ ldr(t, Address(rmethod, Method::const_offset()));
+ __ load_unsigned_short(t, Address(t, ConstMethod::size_of_parameters_offset()));
+
+ __ sub(rscratch1, esp, t, ext::uxtx, Interpreter::logStackElementSize);
+ __ andr(sp, rscratch1, -16);
+ __ mov(esp, rscratch1);
+
+ // get signature handler
+ {
+ Label L;
+ __ ldr(t, Address(rmethod, Method::signature_handler_offset()));
+ __ cbnz(t, L);
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::prepare_native_call),
+ rmethod);
+ __ ldr(t, Address(rmethod, Method::signature_handler_offset()));
+ __ bind(L);
+ }
+
+ // call signature handler
+ assert(InterpreterRuntime::SignatureHandlerGenerator::from() == rlocals,
+ "adjust this code");
+ assert(InterpreterRuntime::SignatureHandlerGenerator::to() == sp,
+ "adjust this code");
+ assert(InterpreterRuntime::SignatureHandlerGenerator::temp() == rscratch1,
+ "adjust this code");
+
+ // The generated handlers do not touch rmethod (the method).
+ // However, large signatures cannot be cached and are generated
+ // each time here. The slow-path generator can do a GC on return,
+ // so we must reload it after the call.
+ __ blr(t);
+ __ get_method(rmethod); // slow path can do a GC, reload rmethod
+
+
+ // result handler is in r0
+ // set result handler
+ __ mov(result_handler, r0);
+ // pass mirror handle if static call
+ {
+ Label L;
+ const int mirror_offset = in_bytes(Klass::java_mirror_offset());
+ __ ldrw(t, Address(rmethod, Method::access_flags_offset()));
+ __ tst(t, JVM_ACC_STATIC);
+ __ br(Assembler::EQ, L);
+ // get mirror
+ __ ldr(t, Address(rmethod, Method::const_offset()));
+ __ ldr(t, Address(t, ConstMethod::constants_offset()));
+ __ ldr(t, Address(t, ConstantPool::pool_holder_offset_in_bytes()));
+ __ ldr(t, Address(t, mirror_offset));
+ // copy mirror into activation frame
+ __ str(t, Address(rfp, frame::interpreter_frame_oop_temp_offset * wordSize));
+ // pass handle to mirror
+ __ add(c_rarg1, rfp, frame::interpreter_frame_oop_temp_offset * wordSize);
+ __ bind(L);
+ }
+
+ // get native function entry point in r10
+ {
+ Label L;
+ __ ldr(r10, Address(rmethod, Method::native_function_offset()));
+ address unsatisfied = (SharedRuntime::native_method_throw_unsatisfied_link_error_entry());
+ __ mov(rscratch2, unsatisfied);
+ __ ldr(rscratch2, rscratch2);
+ __ cmp(r10, rscratch2);
+ __ br(Assembler::NE, L);
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::prepare_native_call),
+ rmethod);
+ __ get_method(rmethod);
+ __ ldr(r10, Address(rmethod, Method::native_function_offset()));
+ __ bind(L);
+ }
+
+ // pass JNIEnv
+ __ add(c_rarg0, rthread, in_bytes(JavaThread::jni_environment_offset()));
+
+ // It is enough that the pc() points into the right code
+ // segment. It does not have to be the correct return pc.
+ __ set_last_Java_frame(esp, rfp, (address)NULL, rscratch1);
+
+ // change thread state
+#ifdef ASSERT
+ {
+ Label L;
+ __ ldrw(t, Address(rthread, JavaThread::thread_state_offset()));
+ __ cmp(t, _thread_in_Java);
+ __ br(Assembler::EQ, L);
+ __ stop("Wrong thread state in native stub");
+ __ bind(L);
+ }
+#endif
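+
+ // Thread-state choreography around the native call (a summary of the
+ // transitions generated below, not additional behaviour):
+ //   _thread_in_Java         -> _thread_in_native        (before blrt)
+ //   _thread_in_native       -> _thread_in_native_trans  (after the call)
+ //   safepoint/suspend check while in _thread_in_native_trans
+ //   _thread_in_native_trans -> _thread_in_Java          (after the check)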
+
+ // Change state to native
+ __ mov(rscratch1, _thread_in_native);
+ __ strw(rscratch1, Address(rthread, JavaThread::thread_state_offset()));
+
+ // Call the native method.
+ __ blrt(r10, rscratch1);
+ __ maybe_isb();
+ __ get_method(rmethod);
+ // result potentially in r0 or v0
+
+ // make room for the pushes we're about to do
+ __ sub(rscratch1, esp, 4 * wordSize);
+ __ andr(sp, rscratch1, -16);
+
+ // NOTE: The order of these pushes is known to frame::interpreter_frame_result
+ // in order to extract the result of a method call. If the order of these
+ // pushes changes or anything else is added to the stack then the code in
+ // interpreter_frame_result must also change.
+ __ push(dtos);
+ __ push(ltos);
+
+ // change thread state
+ __ mov(rscratch1, _thread_in_native_trans);
+ __ strw(rscratch1, Address(rthread, JavaThread::thread_state_offset()));
+
+ if (os::is_MP()) {
+ if (UseMembar) {
+ // Force this write out before the read below
+ __ dsb(Assembler::SY);
+ } else {
+ // Write serialization page so VM thread can do a pseudo remote membar.
+ // We use the current thread pointer to calculate a thread specific
+ // offset to write to within the page. This minimizes bus traffic
+ // due to cache line collision.
+ __ serialize_memory(rthread, rscratch2);
+ }
+ }
+
+ // check for safepoint operation in progress and/or pending suspend requests
+ {
+ Label Continue;
+ {
+ unsigned long offset;
+ __ adrp(rscratch2, SafepointSynchronize::address_of_state(), offset);
+ __ ldrw(rscratch2, Address(rscratch2, offset));
+ }
+ assert(SafepointSynchronize::_not_synchronized == 0,
+ "SafepointSynchronize::_not_synchronized");
+ Label L;
+ __ cbnz(rscratch2, L);
+ __ ldrw(rscratch2, Address(rthread, JavaThread::suspend_flags_offset()));
+ __ cbz(rscratch2, Continue);
+ __ bind(L);
+
+ // Don't use call_VM as it will see a possible pending exception
+ // and forward it and never return here preventing us from
+ // clearing _last_native_pc down below. So we do a runtime call by
+ // hand.
+ //
+ __ mov(c_rarg0, rthread);
+ __ mov(rscratch2, CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans));
+ __ blrt(rscratch2, 1, 0, 0);
+ __ maybe_isb();
+ __ get_method(rmethod);
+ __ reinit_heapbase();
+ __ bind(Continue);
+ }
+
+ // change thread state
+ __ mov(rscratch1, _thread_in_Java);
+ __ strw(rscratch1, Address(rthread, JavaThread::thread_state_offset()));
+
+ // reset_last_Java_frame
+ __ reset_last_Java_frame(true, true);
+
+ // reset handle block
+ __ ldr(t, Address(rthread, JavaThread::active_handles_offset()));
+ __ str(zr, Address(t, JNIHandleBlock::top_offset_in_bytes()));
+
+ // If result is an oop unbox and store it in frame where gc will see it
+ // and result handler will pick it up
+
+ {
+ Label no_oop, store_result;
+ __ adr(t, ExternalAddress(AbstractInterpreter::result_handler(T_OBJECT)));
+ __ cmp(t, result_handler);
+ __ br(Assembler::NE, no_oop);
+ // retrieve result
+ __ pop(ltos);
+ __ cbz(r0, store_result);
+ __ ldr(r0, Address(r0, 0));
+ __ bind(store_result);
+ __ str(r0, Address(rfp, frame::interpreter_frame_oop_temp_offset*wordSize));
+ // keep stack depth as expected by pushing oop which will eventually be discarded
+ __ push(ltos);
+ __ bind(no_oop);
+ }
+
+ {
+ Label no_reguard;
+ __ lea(rscratch1, Address(rthread, in_bytes(JavaThread::stack_guard_state_offset())));
+ __ ldrb(rscratch1, Address(rscratch1));
+ __ cmp(rscratch1, JavaThread::stack_guard_yellow_disabled);
+ __ br(Assembler::NE, no_reguard);
+
+ __ pusha(); // XXX only save smashed registers
+ __ mov(c_rarg0, rthread);
+ __ mov(rscratch2, CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages));
+ __ blrt(rscratch2, 0, 0, 0);
+ __ popa(); // XXX only restore smashed registers
+ __ bind(no_reguard);
+ }
+
+ // The method register is junk from after the thread_in_native transition
+ // until here. Also can't call_VM until the bcp has been
+ // restored. Need bcp for throwing exception below so get it now.
+ __ get_method(rmethod);
+
+ // restore bcp to have legal interpreter frame, i.e., bci == 0 <=>
+ // rbcp == code_base()
+ __ ldr(rbcp, Address(rmethod, Method::const_offset())); // get ConstMethod*
+ __ add(rbcp, rbcp, in_bytes(ConstMethod::codes_offset())); // get codebase
+ // handle exceptions (exception handling will handle unlocking!)
+ {
+ Label L;
+ __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
+ __ cbz(rscratch1, L);
+ // Note: At some point we may want to unify this with the code
+ // used in call_VM_base(); i.e., we should use the
+ // StubRoutines::forward_exception code. For now this doesn't work
+ // here because the rsp is not correctly set at this point.
+ __ MacroAssembler::call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::throw_pending_exception));
+ __ should_not_reach_here();
+ __ bind(L);
+ }
+
+ // do unlocking if necessary
+ {
+ Label L;
+ __ ldrw(t, Address(rmethod, Method::access_flags_offset()));
+ __ tst(t, JVM_ACC_SYNCHRONIZED);
+ __ br(Assembler::EQ, L);
+ // the code below should be shared with interpreter macro
+ // assembler implementation
+ {
+ Label unlock;
+ // BasicObjectLock will be first in list, since this is a
+ // synchronized method. However, we need to check that the object
+ // has not been unlocked by an explicit monitorexit bytecode.
+
+ // the monitor is expected in c_rarg1 for the slow unlock path
+ __ lea (c_rarg1, Address(rfp, // address of first monitor
+ (intptr_t)(frame::interpreter_frame_initial_sp_offset *
+ wordSize - sizeof(BasicObjectLock))));
+
+ __ ldr(t, Address(c_rarg1, BasicObjectLock::obj_offset_in_bytes()));
+ __ cbnz(t, unlock);
+
+ // Entry already unlocked, need to throw exception
+ __ MacroAssembler::call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::throw_illegal_monitor_state_exception));
+ __ should_not_reach_here();
+
+ __ bind(unlock);
+ __ unlock_object(c_rarg1);
+ }
+ __ bind(L);
+ }
+
+ // jvmti support
+ // Note: This must happen _after_ handling/throwing any exceptions since
+ // the exception handler code notifies the runtime of method exits
+ // too. If this happens before, method entry/exit notifications are
+ // not properly paired (was bug - gri 11/22/99).
+ __ notify_method_exit(vtos, InterpreterMacroAssembler::NotifyJVMTI);
+
+ // restore potential result in r0/d0, then call the result handler to
+ // convert and handle the result
+
+ __ pop(ltos);
+ __ pop(dtos);
+
+ __ blr(result_handler);
+
+ // remove activation
+ __ ldr(esp, Address(rfp,
+ frame::interpreter_frame_sender_sp_offset *
+ wordSize)); // get sender sp
+ // remove frame anchor
+ __ leave();
+
+ // restore sender sp
+ __ mov(sp, esp);
+
+ __ ret(lr);
+
+ if (inc_counter) {
+ // Handle overflow of counter and compile method
+ __ bind(invocation_counter_overflow);
+ generate_counter_overflow(&continue_after_compile);
+ }
+
+ return entry_point;
+}
+
+//
+// Generic interpreted method entry to (asm) interpreter
+//
+address InterpreterGenerator::generate_normal_entry(bool synchronized) {
+ // determine code generation flags
+ bool inc_counter = UseCompiler || CountCompiledCalls;
+
+ // rscratch1: sender sp
+ address entry_point = __ pc();
+
+ const Address constMethod(rmethod, Method::const_offset());
+ const Address access_flags(rmethod, Method::access_flags_offset());
+ const Address size_of_parameters(r3,
+ ConstMethod::size_of_parameters_offset());
+ const Address size_of_locals(r3, ConstMethod::size_of_locals_offset());
+
+ // get parameter size (always needed)
+ // need to load the const method first
+ __ ldr(r3, constMethod);
+ __ load_unsigned_short(r2, size_of_parameters);
+
+ // r2: size of parameters
+
+ __ load_unsigned_short(r3, size_of_locals); // get size of locals in words
+ __ sub(r3, r3, r2); // r3 = no. of additional locals
+
+ // see if we've got enough room on the stack for locals plus overhead.
+ generate_stack_overflow_check();
+
+ // compute beginning of parameters (rlocals)
+ __ add(rlocals, esp, r2, ext::uxtx, 3);
+ __ sub(rlocals, rlocals, wordSize);
+
+ // Make room for locals
+ __ sub(rscratch1, esp, r3, ext::uxtx, 3);
+ __ andr(sp, rscratch1, -16);
+
+ // r3 - # of additional locals
+ // allocate space for locals
+ // explicitly initialize locals
+ {
+ Label exit, loop;
+ __ ands(zr, r3, r3);
+ __ br(Assembler::LE, exit); // do nothing if r3 <= 0
+ __ bind(loop);
+ __ str(zr, Address(__ post(rscratch1, wordSize)));
+ __ sub(r3, r3, 1); // until everything initialized
+ __ cbnz(r3, loop);
+ __ bind(exit);
+ }
+
+ // And the base dispatch table
+ __ get_dispatch();
+
+ // initialize fixed part of activation frame
+ generate_fixed_frame(false);
+#ifndef PRODUCT
+ // tell the simulator that a method has been entered
+ if (NotifySimulator) {
+ __ notify(Assembler::method_entry);
+ }
+#endif
+ // make sure method is not native & not abstract
+#ifdef ASSERT
+ __ ldrw(r0, access_flags);
+ {
+ Label L;
+ __ tst(r0, JVM_ACC_NATIVE);
+ __ br(Assembler::EQ, L);
+ __ stop("tried to execute native method as non-native");
+ __ bind(L);
+ }
+ {
+ Label L;
+ __ tst(r0, JVM_ACC_ABSTRACT);
+ __ br(Assembler::EQ, L);
+ __ stop("tried to execute abstract method in interpreter");
+ __ bind(L);
+ }
+#endif
+
+ // Since at this point in the method invocation the exception
+ // handler would try to exit the monitor of synchronized methods
+ // which hasn't been entered yet, we set the thread local variable
+ // _do_not_unlock_if_synchronized to true. The remove_activation
+ // will check this flag.
+
+ const Address do_not_unlock_if_synchronized(rthread,
+ in_bytes(JavaThread::do_not_unlock_if_synchronized_offset()));
+ __ mov(rscratch2, true);
+ __ strb(rscratch2, do_not_unlock_if_synchronized);
+
+ // increment invocation count & check for overflow
+ Label invocation_counter_overflow;
+ Label profile_method;
+ Label profile_method_continue;
+ if (inc_counter) {
+ generate_counter_incr(&invocation_counter_overflow,
+ &profile_method,
+ &profile_method_continue);
+ if (ProfileInterpreter) {
+ __ bind(profile_method_continue);
+ }
+ }
+
+ Label continue_after_compile;
+ __ bind(continue_after_compile);
+
+ bang_stack_shadow_pages(false);
+
+ // reset the _do_not_unlock_if_synchronized flag
+ __ strb(zr, do_not_unlock_if_synchronized);
+
+ // check for synchronized methods
+ // Must happen AFTER invocation_counter check and stack overflow check,
+ // so the method is not locked if the counter overflows.
+ if (synchronized) {
+ // Allocate monitor and lock method
+ lock_method();
+ } else {
+ // no synchronization necessary
+#ifdef ASSERT
+ {
+ Label L;
+ __ ldrw(r0, access_flags);
+ __ tst(r0, JVM_ACC_SYNCHRONIZED);
+ __ br(Assembler::EQ, L);
+ __ stop("method needs synchronization");
+ __ bind(L);
+ }
+#endif
+ }
+
+ // start execution
+#ifdef ASSERT
+ {
+ Label L;
+ const Address monitor_block_top (rfp,
+ frame::interpreter_frame_monitor_block_top_offset * wordSize);
+ __ ldr(rscratch1, monitor_block_top);
+ __ cmp(esp, rscratch1);
+ __ br(Assembler::EQ, L);
+ __ stop("broken stack frame setup in interpreter");
+ __ bind(L);
+ }
+#endif
+
+ // jvmti support
+ __ notify_method_entry();
+
+ __ dispatch_next(vtos);
+
+ // invocation counter overflow
+ if (inc_counter) {
+ if (ProfileInterpreter) {
+ // We have decided to profile this method in the interpreter
+ __ bind(profile_method);
+ __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::profile_method));
+ __ set_method_data_pointer_for_bcp();
+ // don't think we need this
+ __ get_method(r1);
+ __ b(profile_method_continue);
+ }
+ // Handle overflow of counter and compile method
+ __ bind(invocation_counter_overflow);
+ generate_counter_overflow(&continue_after_compile);
+ }
+
+ return entry_point;
+}
+
+// These should never be compiled since the interpreter will prefer
+// the compiled version to the intrinsic version.
+bool AbstractInterpreter::can_be_compiled(methodHandle m) {
+ switch (method_kind(m)) {
+ case Interpreter::java_lang_math_sin : // fall thru
+ case Interpreter::java_lang_math_cos : // fall thru
+ case Interpreter::java_lang_math_tan : // fall thru
+ case Interpreter::java_lang_math_abs : // fall thru
+ case Interpreter::java_lang_math_log : // fall thru
+ case Interpreter::java_lang_math_log10 : // fall thru
+ case Interpreter::java_lang_math_sqrt : // fall thru
+ case Interpreter::java_lang_math_pow : // fall thru
+ case Interpreter::java_lang_math_exp :
+ return false;
+ default:
+ return true;
+ }
+}
+
+// How much stack a method activation needs in words.
+int AbstractInterpreter::size_top_interpreter_activation(Method* method) {
+ const int entry_size = frame::interpreter_frame_monitor_size();
+
+ // total overhead size: entry_size + (saved rfp thru expr stack
+ // bottom). be sure to change this if you add/subtract anything
+ // to/from the overhead area
+ const int overhead_size =
+ -(frame::interpreter_frame_initial_sp_offset) + entry_size;
+
+ const int stub_code = frame::entry_frame_after_call_words;
+ const int method_stack = (method->max_locals() + method->max_stack()) *
+ Interpreter::stackElementWords;
+ return (overhead_size + method_stack + stub_code);
+}
+
+// asm based interpreter deoptimization helpers
+int AbstractInterpreter::size_activation(int max_stack,
+ int temps,
+ int extra_args,
+ int monitors,
+ int callee_params,
+ int callee_locals,
+ bool is_top_frame) {
+ // Note: This calculation must exactly parallel the frame setup
+ // in InterpreterGenerator::generate_method_entry.
+
+ // fixed size of an interpreter frame:
+ int overhead = frame::sender_sp_offset -
+ frame::interpreter_frame_initial_sp_offset;
+ // Our locals were accounted for by the caller (or last_frame_adjust
+ // on the transition). Since the callee parameters already account
+ // for the callee's params, we only need to account for the extra
+ // locals.
+ int size = overhead +
+ (callee_locals - callee_params)*Interpreter::stackElementWords +
+ monitors * frame::interpreter_frame_monitor_size() +
+ temps* Interpreter::stackElementWords + extra_args;
+
+ // On AArch64 we always keep the stack pointer 16-aligned, so we
+ // must round up here.
+ size = round_to(size, 2);
+
+ return size;
+}
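+
+// For example (illustrative numbers only, not the real AArch64
+// constants): if the fixed overhead were 12 words and a monitor entry
+// 4 words, then with 3 temps, 1 monitor, no extra args and
+// callee_locals == callee_params the raw size is 12 + 0 + 4 + 3 + 0
+// = 19 words, rounded up to 20 to keep sp 16-byte aligned.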
+
+void AbstractInterpreter::layout_activation(Method* method,
+ int tempcount,
+ int popframe_extra_args,
+ int moncount,
+ int caller_actual_parameters,
+ int callee_param_count,
+ int callee_locals,
+ frame* caller,
+ frame* interpreter_frame,
+ bool is_top_frame,
+ bool is_bottom_frame) {
+ // The frame interpreter_frame is guaranteed to be the right size,
+ // as determined by a previous call to the size_activation() method.
+ // It is also guaranteed to be walkable even though it is in a
+ // skeletal state
+
+ int max_locals = method->max_locals() * Interpreter::stackElementWords;
+ int extra_locals = (method->max_locals() - method->size_of_parameters()) *
+ Interpreter::stackElementWords;
+
+#ifdef ASSERT
+ assert(caller->sp() == interpreter_frame->sender_sp(), "Frame not properly walkable");
+#endif
+
+ interpreter_frame->interpreter_frame_set_method(method);
+ // NOTE the difference between sender_sp and
+ // interpreter_frame_sender_sp: interpreter_frame_sender_sp is
+ // the original sp of the caller (the unextended_sp) and
+ // sender_sp is fp+8/16 (32bit/64bit) XXX
+ intptr_t* locals = interpreter_frame->sender_sp() + max_locals - 1;
+
+#ifdef ASSERT
+ if (caller->is_interpreted_frame()) {
+ assert(locals < caller->fp() + frame::interpreter_frame_initial_sp_offset, "bad placement");
+ }
+#endif
+
+ interpreter_frame->interpreter_frame_set_locals(locals);
+ BasicObjectLock* montop = interpreter_frame->interpreter_frame_monitor_begin();
+ BasicObjectLock* monbot = montop - moncount;
+ interpreter_frame->interpreter_frame_set_monitor_end(monbot);
+
+ // Set last_sp
+ intptr_t* esp = (intptr_t*) monbot -
+ tempcount*Interpreter::stackElementWords -
+ popframe_extra_args;
+ interpreter_frame->interpreter_frame_set_last_sp(esp);
+
+ // All frames but the initial (oldest) interpreter frame we fill in have
+ // a value for sender_sp that allows walking the stack but isn't
+ // truly correct. Correct the value here.
+ if (extra_locals != 0 &&
+ interpreter_frame->sender_sp() ==
+ interpreter_frame->interpreter_frame_sender_sp()) {
+ interpreter_frame->set_interpreter_frame_sender_sp(caller->sp() +
+ extra_locals);
+ }
+ *interpreter_frame->interpreter_frame_cache_addr() =
+ method->constants()->cache();
+}
+
+
+//-----------------------------------------------------------------------------
+// Exceptions
+
+void TemplateInterpreterGenerator::generate_throw_exception() {
+ // Entry point in previous activation (i.e., if the caller was
+ // interpreted)
+ Interpreter::_rethrow_exception_entry = __ pc();
+ // Restore sp to interpreter_frame_last_sp even though we are going
+ // to empty the expression stack for the exception processing.
+ __ str(zr, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
+ // r0: exception
+ // r3: return address/pc that threw exception
+ __ restore_bcp(); // rbcp points to call/send
+ __ restore_locals();
+ __ restore_constant_pool_cache();
+ __ reinit_heapbase(); // restore rheapbase as heapbase.
+ __ get_dispatch();
+
+#ifndef PRODUCT
+ // tell the simulator that the caller method has been reentered
+ if (NotifySimulator) {
+ __ get_method(rmethod);
+ __ notify(Assembler::method_reentry);
+ }
+#endif
+ // Entry point for exceptions thrown within interpreter code
+ Interpreter::_throw_exception_entry = __ pc();
+ // If we came here via a NullPointerException on the receiver of a
+ // method, rmethod may be corrupt.
+ __ get_method(rmethod);
+ // expression stack is undefined here
+ // r0: exception
+ // rbcp: exception bcp
+ __ verify_oop(r0);
+ __ mov(c_rarg1, r0);
+
+ // expression stack must be empty before entering the VM in case of
+ // an exception
+ __ empty_expression_stack();
+ // find exception handler address and preserve exception oop
+ __ call_VM(r3,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::exception_handler_for_exception),
+ c_rarg1);
+
+ // Calculate stack limit
+ __ ldr(rscratch1, Address(rmethod, Method::const_offset()));
+ __ ldrh(rscratch1, Address(rscratch1, ConstMethod::max_stack_offset()));
+ __ add(rscratch1, rscratch1, frame::interpreter_frame_monitor_size() + 4);
+ __ ldr(rscratch2,
+ Address(rfp, frame::interpreter_frame_initial_sp_offset * wordSize));
+ __ sub(rscratch1, rscratch2, rscratch1, ext::uxtx, 3);
+ __ andr(sp, rscratch1, -16);
+
+ // r0: exception handler entry point
+ // r3: preserved exception oop
+ // rbcp: bcp for exception handler
+ __ push_ptr(r3); // push exception which is now the only value on the stack
+ __ br(r0); // jump to exception handler (may be _remove_activation_entry!)
+
+ // If the exception is not handled in the current frame the frame is
+ // removed and the exception is rethrown (i.e. exception
+ // continuation is _rethrow_exception).
+ //
+ // Note: At this point the bci still refers to the instruction
+ // which caused the exception and the expression stack is
+ // empty. Thus, for any VM calls at this point, GC will find a legal
+ // oop map (with empty expression stack).
+
+ //
+ // JVMTI PopFrame support
+ //
+
+ Interpreter::_remove_activation_preserving_args_entry = __ pc();
+ __ empty_expression_stack();
+ // Set the popframe_processing bit in pending_popframe_condition
+ // indicating that we are currently handling popframe, so that
+ // call_VMs that may happen later do not trigger new popframe
+ // handling cycles.
+ __ ldrw(r3, Address(rthread, JavaThread::popframe_condition_offset()));
+ __ orr(r3, r3, JavaThread::popframe_processing_bit);
+ __ strw(r3, Address(rthread, JavaThread::popframe_condition_offset()));
+
+ {
+ // Check to see whether we are returning to a deoptimized frame.
+ // (The PopFrame call ensures that the caller of the popped frame is
+ // either interpreted or compiled and deoptimizes it if compiled.)
+ // In this case, we can't call dispatch_next() after the frame is
+ // popped, but instead must save the incoming arguments and restore
+ // them after deoptimization has occurred.
+ //
+ // Note that we don't compare the return PC against the
+ // deoptimization blob's unpack entry because of the presence of
+ // adapter frames in C2.
+ Label caller_not_deoptimized;
+ __ ldr(c_rarg1, Address(rfp, frame::return_addr_offset * wordSize));
+ __ super_call_VM_leaf(CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::interpreter_contains), c_rarg1);
+ __ cbnz(r0, caller_not_deoptimized);
+
+ // Compute size of arguments for saving when returning to
+ // deoptimized caller
+ __ get_method(r0);
+ __ ldr(r0, Address(r0, Method::const_offset()));
+ __ load_unsigned_short(r0, Address(r0, in_bytes(ConstMethod::
+ size_of_parameters_offset())));
+ __ lsl(r0, r0, Interpreter::logStackElementSize);
+ __ restore_locals(); // XXX do we need this?
+ __ sub(rlocals, rlocals, r0);
+ __ add(rlocals, rlocals, wordSize);
+ // Save these arguments
+ __ super_call_VM_leaf(CAST_FROM_FN_PTR(address,
+ Deoptimization::
+ popframe_preserve_args),
+ rthread, r0, rlocals);
+
+ __ remove_activation(vtos,
+ /* throw_monitor_exception */ false,
+ /* install_monitor_exception */ false,
+ /* notify_jvmdi */ false);
+
+ // Inform deoptimization that it is responsible for restoring
+ // these arguments
+ __ mov(rscratch1, JavaThread::popframe_force_deopt_reexecution_bit);
+ __ strw(rscratch1, Address(rthread, JavaThread::popframe_condition_offset()));
+
+ // Continue in deoptimization handler
+ __ ret(lr);
+
+ __ bind(caller_not_deoptimized);
+ }
+
+ __ remove_activation(vtos,
+ /* throw_monitor_exception */ false,
+ /* install_monitor_exception */ false,
+ /* notify_jvmdi */ false);
+
+ // Restore the last_sp and null it out
+ __ ldr(esp, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
+ __ str(zr, Address(rfp, frame::interpreter_frame_last_sp_offset * wordSize));
+
+ __ restore_bcp();
+ __ restore_locals();
+ __ restore_constant_pool_cache();
+ __ get_method(rmethod);
+
+ // The method data pointer was incremented already during
+ // call profiling. We have to restore the mdp for the current bcp.
+ if (ProfileInterpreter) {
+ __ set_method_data_pointer_for_bcp();
+ }
+
+ // Clear the popframe condition flag
+ __ strw(zr, Address(rthread, JavaThread::popframe_condition_offset()));
+ assert(JavaThread::popframe_inactive == 0, "fix popframe_inactive");
+
+#if INCLUDE_JVMTI
+ {
+ Label L_done;
+
+ __ ldrb(rscratch1, Address(rbcp, 0));
+ __ cmpw(r1, Bytecodes::_invokestatic);
+ __ br(Assembler::EQ, L_done);
+
+ // The member name argument must be restored if _invokestatic is re-executed after a PopFrame call.
+ // Detect such a case in the InterpreterRuntime function and return the member name argument, or NULL.
+
+ __ ldr(c_rarg0, Address(rlocals, 0));
+ __ call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::member_name_arg_or_null), c_rarg0, rmethod, rbcp);
+
+ __ cbz(r0, L_done);
+
+ __ str(r0, Address(esp, 0));
+ __ bind(L_done);
+ }
+#endif // INCLUDE_JVMTI
+
+ // Restore machine SP
+ __ ldr(rscratch1, Address(rmethod, Method::const_offset()));
+ __ ldrh(rscratch1, Address(rscratch1, ConstMethod::max_stack_offset()));
+ __ add(rscratch1, rscratch1, frame::interpreter_frame_monitor_size() + 4);
+ __ ldr(rscratch2,
+ Address(rfp, frame::interpreter_frame_initial_sp_offset * wordSize));
+ __ sub(rscratch1, rscratch2, rscratch1, ext::uxtw, 3);
+ __ andr(sp, rscratch1, -16);
+
+ __ dispatch_next(vtos);
+ // end of PopFrame support
+
+ Interpreter::_remove_activation_entry = __ pc();
+
+ // preserve exception over this code sequence
+ __ pop_ptr(r0);
+ __ str(r0, Address(rthread, JavaThread::vm_result_offset()));
+ // remove the activation (without doing throws on illegalMonitorExceptions)
+ __ remove_activation(vtos, false, true, false);
+ // restore exception
+ __ get_vm_result(r0, rthread);
+
+ // In between activations - previous activation type unknown yet
+ // compute continuation point - the continuation point expects the
+ // following registers set up:
+ //
+ // r0: exception
+ // lr: return address/pc that threw exception
+ // rsp: expression stack of caller
+ // rfp: fp of caller
+ // FIXME: There's no point saving LR here because VM calls don't trash it
+ __ stp(r0, lr, Address(__ pre(sp, -2 * wordSize))); // save exception & return address
+ __ super_call_VM_leaf(CAST_FROM_FN_PTR(address,
+ SharedRuntime::exception_handler_for_return_address),
+ rthread, lr);
+ __ mov(r1, r0); // save exception handler
+ __ ldp(r0, lr, Address(__ post(sp, 2 * wordSize))); // restore exception & return address
+ // We might be returning to a deopt handler that expects r3 to
+ // contain the exception pc
+ __ mov(r3, lr);
+ // Note that an "issuing PC" is actually the next PC after the call
+ __ br(r1); // jump to exception
+ // handler of caller
+}
+
+
+//
+// JVMTI ForceEarlyReturn support
+//
+address TemplateInterpreterGenerator::generate_earlyret_entry_for(TosState state) {
+ address entry = __ pc();
+
+ __ restore_bcp();
+ __ restore_locals();
+ __ empty_expression_stack();
+ __ load_earlyret_value(state);
+
+ __ ldr(rscratch1, Address(rthread, JavaThread::jvmti_thread_state_offset()));
+ Address cond_addr(rscratch1, JvmtiThreadState::earlyret_state_offset());
+
+ // Clear the earlyret state
+ assert(JvmtiThreadState::earlyret_inactive == 0, "should be");
+ __ str(zr, cond_addr);
+
+ __ remove_activation(state,
+ false, /* throw_monitor_exception */
+ false, /* install_monitor_exception */
+ true); /* notify_jvmdi */
+ __ ret(lr);
+
+ return entry;
+} // end of ForceEarlyReturn support
+
+
+
+//-----------------------------------------------------------------------------
+// Helper for vtos entry point generation
+
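+// Each typed entry below pushes its tos value onto the expression
+// stack and then joins the common vtos code (the int entry simply
+// falls through to it), so a single template body serves all tos states.
+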
+void TemplateInterpreterGenerator::set_vtos_entry_points(Template* t,
+ address& bep,
+ address& cep,
+ address& sep,
+ address& aep,
+ address& iep,
+ address& lep,
+ address& fep,
+ address& dep,
+ address& vep) {
+ assert(t->is_valid() && t->tos_in() == vtos, "illegal template");
+ Label L;
+ aep = __ pc(); __ push_ptr(); __ b(L);
+ fep = __ pc(); __ push_f(); __ b(L);
+ dep = __ pc(); __ push_d(); __ b(L);
+ lep = __ pc(); __ push_l(); __ b(L);
+ bep = cep = sep =
+ iep = __ pc(); __ push_i();
+ vep = __ pc();
+ __ bind(L);
+ generate_and_dispatch(t);
+}
+
+//-----------------------------------------------------------------------------
+// Generation of individual instructions
+
+// helpers for generate_and_dispatch
+
+
+InterpreterGenerator::InterpreterGenerator(StubQueue* code)
+ : TemplateInterpreterGenerator(code) {
+ generate_all(); // down here so it can be "virtual"
+}
+
+//-----------------------------------------------------------------------------
+
+// Non-product code
+#ifndef PRODUCT
+address TemplateInterpreterGenerator::generate_trace_code(TosState state) {
+ address entry = __ pc();
+
+ __ push(lr);
+ __ push(state);
+ __ push(RegSet::range(r0, r15), sp);
+ __ mov(c_rarg2, r0); // Pass itos
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address, SharedRuntime::trace_bytecode),
+ c_rarg1, c_rarg2, c_rarg3);
+ __ pop(RegSet::range(r0, r15), sp);
+ __ pop(state);
+ __ pop(lr);
+ __ ret(lr); // return from result handler
+
+ return entry;
+}
+
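+// count_bytecode() below is, roughly, the following C (illustrative
+// only): an exclusive-load/store retry loop that atomically bumps
+// BytecodeCounter::_counter_value:
+//
+//   do {
+//     old = load_exclusive(&counter);                 // ldxr
+//   } while (!store_exclusive(&counter, old + 1));    // stxr + cbnzw retry
+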
+void TemplateInterpreterGenerator::count_bytecode() {
+ __ push(rscratch1);
+ __ push(rscratch2);
+ Label L;
+ __ mov(rscratch2, (address) &BytecodeCounter::_counter_value);
+ __ bind(L);
+ __ ldxr(rscratch1, rscratch2);
+ __ add(rscratch1, rscratch1, 1);
+ __ stxr(rscratch1, rscratch1, rscratch2);
+ __ cbnzw(rscratch1, L);
+ __ pop(rscratch2);
+ __ pop(rscratch1);
+}
+
+void TemplateInterpreterGenerator::histogram_bytecode(Template* t) { ; }
+
+void TemplateInterpreterGenerator::histogram_bytecode_pair(Template* t) { ; }
+
+
+void TemplateInterpreterGenerator::trace_bytecode(Template* t) {
+ // Call a little run-time stub to avoid blow-up for each bytecode.
+ // The run-time stub saves the right registers, depending on
+ // the tosca in-state for the given template.
+
+ assert(Interpreter::trace_code(t->tos_in()) != NULL,
+ "entry must have been generated");
+ __ bl(Interpreter::trace_code(t->tos_in()));
+ __ reinit_heapbase();
+}
+
+
+void TemplateInterpreterGenerator::stop_interpreter_at() {
+ Label L;
+ __ push(rscratch1);
+ __ mov(rscratch1, (address) &BytecodeCounter::_counter_value);
+ __ ldr(rscratch1, Address(rscratch1));
+ __ mov(rscratch2, StopInterpreterAt);
+ __ cmpw(rscratch1, rscratch2);
+ __ br(Assembler::NE, L);
+ __ brk(0);
+ __ bind(L);
+ __ pop(rscratch1);
+}
+
+#ifdef BUILTIN_SIM
+
+#include <sys/mman.h>
+#include <unistd.h>
+
+extern "C" {
+ static int PAGESIZE = getpagesize();
+ int is_mapped_address(u_int64_t address)
+ {
+ address = (address & ~((u_int64_t)PAGESIZE - 1));
+ if (msync((void *)address, PAGESIZE, MS_ASYNC) == 0) {
+ return true;
+ }
+ if (errno != ENOMEM) {
+ return true;
+ }
+ return false;
+ }
+
+ void bccheck1(u_int64_t pc, u_int64_t fp, char *method, int *bcidx, int *framesize, char *decode)
+ {
+ if (method != 0) {
+ method[0] = '\0';
+ }
+ if (bcidx != 0) {
+ *bcidx = -2;
+ }
+ if (decode != 0) {
+ decode[0] = 0;
+ }
+
+ if (framesize != 0) {
+ *framesize = -1;
+ }
+
+ if (Interpreter::contains((address)pc)) {
+ AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
+ Method* meth;
+ address bcp;
+ if (fp) {
+#define FRAME_SLOT_METHOD 3
+#define FRAME_SLOT_BCP 7
+ meth = (Method*)sim->getMemory()->loadU64(fp - (FRAME_SLOT_METHOD << 3));
+ bcp = (address)sim->getMemory()->loadU64(fp - (FRAME_SLOT_BCP << 3));
+#undef FRAME_SLOT_METHOD
+#undef FRAME_SLOT_BCP
+ } else {
+ meth = (Method*)sim->getCPUState().xreg(RMETHOD, 0);
+ bcp = (address)sim->getCPUState().xreg(RBCP, 0);
+ }
+ if (meth->is_native()) {
+ return;
+ }
+ if(method && meth->is_method()) {
+ ResourceMark rm;
+ method[0] = 'I';
+ method[1] = ' ';
+ meth->name_and_sig_as_C_string(method + 2, 398);
+ }
+ if (bcidx) {
+ if (meth->contains(bcp)) {
+ *bcidx = meth->bci_from(bcp);
+ } else {
+ *bcidx = -2;
+ }
+ }
+ if (decode) {
+ if (!BytecodeTracer::closure()) {
+ BytecodeTracer::set_closure(BytecodeTracer::std_closure());
+ }
+ stringStream str(decode, 400);
+ BytecodeTracer::trace(meth, bcp, &str);
+ }
+ } else {
+ if (method) {
+ CodeBlob *cb = CodeCache::find_blob((address)pc);
+ if (cb != NULL) {
+ if (cb->is_nmethod()) {
+ ResourceMark rm;
+ nmethod* nm = (nmethod*)cb;
+ method[0] = 'C';
+ method[1] = ' ';
+ nm->method()->name_and_sig_as_C_string(method + 2, 398);
+ } else if (cb->is_adapter_blob()) {
+ strcpy(method, "B adapter blob");
+ } else if (cb->is_runtime_stub()) {
+ strcpy(method, "B runtime stub");
+ } else if (cb->is_exception_stub()) {
+ strcpy(method, "B exception stub");
+ } else if (cb->is_deoptimization_stub()) {
+ strcpy(method, "B deoptimization stub");
+ } else if (cb->is_safepoint_stub()) {
+ strcpy(method, "B safepoint stub");
+ } else if (cb->is_uncommon_trap_stub()) {
+ strcpy(method, "B uncommon trap stub");
+ } else if (cb->contains((address)StubRoutines::call_stub())) {
+ strcpy(method, "B call stub");
+ } else {
+ strcpy(method, "B unknown blob : ");
+ strcat(method, cb->name());
+ }
+ if (framesize != NULL) {
+ *framesize = cb->frame_size();
+ }
+ }
+ }
+ }
+ }
+
+
+ JNIEXPORT void bccheck(u_int64_t pc, u_int64_t fp, char *method, int *bcidx, int *framesize, char *decode)
+ {
+ bccheck1(pc, fp, method, bcidx, framesize, decode);
+ }
+}
+
+#endif // BUILTIN_SIM
+#endif // !PRODUCT
+#endif // ! CC_INTERP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/templateInterpreter_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_TEMPLATEINTERPRETER_AARCH64_HPP
+#define CPU_AARCH64_VM_TEMPLATEINTERPRETER_AARCH64_HPP
+
+
+ protected:
+
+ // Size of interpreter code. Increase if too small. Interpreter will
+ // fail with a guarantee ("not enough space for interpreter generation")
+ // if too small.
+ // Run with +PrintInterpreter to get the VM to print out the size.
+ // Max size with JVMTI
+ const static int InterpreterCodeSize = 200 * 1024;
+
+#endif // CPU_AARCH64_VM_TEMPLATEINTERPRETER_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/templateTable_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,3697 @@
+/*
+ * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "interpreter/interpreter.hpp"
+#include "interpreter/interpreterRuntime.hpp"
+#include "interpreter/interp_masm.hpp"
+#include "interpreter/templateTable.hpp"
+#include "memory/universe.inline.hpp"
+#include "oops/methodData.hpp"
+#include "oops/method.hpp"
+#include "oops/objArrayKlass.hpp"
+#include "oops/oop.inline.hpp"
+#include "prims/methodHandles.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "runtime/stubRoutines.hpp"
+#include "runtime/synchronizer.hpp"
+
+#ifndef CC_INTERP
+
+#define __ _masm->
+
+// Platform-dependent initialization
+
+void TemplateTable::pd_initialize() {
+ // No aarch64 specific initialization
+}
+
+// Address computation: local variables
+
+static inline Address iaddress(int n) {
+ return Address(rlocals, Interpreter::local_offset_in_bytes(n));
+}
+
+static inline Address laddress(int n) {
+ return iaddress(n + 1);
+}
+
+static inline Address faddress(int n) {
+ return iaddress(n);
+}
+
+static inline Address daddress(int n) {
+ return laddress(n);
+}
+
+static inline Address aaddress(int n) {
+ return iaddress(n);
+}
+
+static inline Address iaddress(Register r) {
+ return Address(rlocals, r, Address::lsl(3));
+}
+
+static inline Address laddress(Register r, Register scratch,
+ InterpreterMacroAssembler* _masm) {
+ __ lea(scratch, Address(rlocals, r, Address::lsl(3)));
+ return Address(scratch, Interpreter::local_offset_in_bytes(1));
+}
+
+static inline Address faddress(Register r) {
+ return iaddress(r);
+}
+
+static inline Address daddress(Register r, Register scratch,
+ InterpreterMacroAssembler* _masm) {
+ return laddress(r, scratch, _masm);
+}
+
+static inline Address aaddress(Register r) {
+ return iaddress(r);
+}
+
+static inline Address at_rsp() {
+ return Address(esp, 0);
+}
+
+// At the top of the Java expression stack, which may be different from
+// esp() (it isn't different for category 1 values).
+static inline Address at_tos () {
+ return Address(esp, Interpreter::expr_offset_in_bytes(0));
+}
+
+static inline Address at_tos_p1() {
+ return Address(esp, Interpreter::expr_offset_in_bytes(1));
+}
+
+static inline Address at_tos_p2() {
+ return Address(esp, Interpreter::expr_offset_in_bytes(2));
+}
+
+static inline Address at_tos_p3() {
+ return Address(esp, Interpreter::expr_offset_in_bytes(3));
+}
+
+static inline Address at_tos_p4() {
+ return Address(esp, Interpreter::expr_offset_in_bytes(4));
+}
+
+static inline Address at_tos_p5() {
+ return Address(esp, Interpreter::expr_offset_in_bytes(5));
+}
+
+// Condition conversion
+static Assembler::Condition j_not(TemplateTable::Condition cc) {
+ switch (cc) {
+ case TemplateTable::equal : return Assembler::NE;
+ case TemplateTable::not_equal : return Assembler::EQ;
+ case TemplateTable::less : return Assembler::GE;
+ case TemplateTable::less_equal : return Assembler::GT;
+ case TemplateTable::greater : return Assembler::LE;
+ case TemplateTable::greater_equal: return Assembler::LT;
+ }
+ ShouldNotReachHere();
+ return Assembler::EQ;
+}
+
+
+// Miscellaneous helper routines
+// Store an oop (or NULL) at the Address described by obj.
+// If val == noreg this means store a NULL
+static void do_oop_store(InterpreterMacroAssembler* _masm,
+ Address obj,
+ Register val,
+ BarrierSet::Name barrier,
+ bool precise) {
+ assert(val == noreg || val == r0, "parameter is just for looks");
+ switch (barrier) {
+#if INCLUDE_ALL_GCS
+ case BarrierSet::G1SATBCT:
+ case BarrierSet::G1SATBCTLogging:
+ {
+ // flatten object address if needed
+ if (obj.index() == noreg && obj.offset() == 0) {
+ if (obj.base() != r3) {
+ __ mov(r3, obj.base());
+ }
+ } else {
+ __ lea(r3, obj);
+ }
+ __ g1_write_barrier_pre(r3 /* obj */,
+ r1 /* pre_val */,
+ rthread /* thread */,
+ r10 /* tmp */,
+ val != noreg /* tosca_live */,
+ false /* expand_call */);
+ if (val == noreg) {
+ __ store_heap_oop_null(Address(r3, 0));
+ } else {
+ // G1 barrier needs uncompressed oop for region cross check.
+ Register new_val = val;
+ if (UseCompressedOops) {
+ new_val = rscratch1;
+ __ mov(new_val, val);
+ }
+ __ store_heap_oop(Address(r3, 0), val);
+ __ g1_write_barrier_post(r3 /* store_adr */,
+ new_val /* new_val */,
+ rthread /* thread */,
+ r10 /* tmp */,
+ r1 /* tmp2 */);
+ }
+
+ }
+ break;
+#endif // INCLUDE_ALL_GCS
+ case BarrierSet::CardTableModRef:
+ case BarrierSet::CardTableExtension:
+ {
+ if (val == noreg) {
+ __ store_heap_oop_null(obj);
+ } else {
+ __ store_heap_oop(obj, val);
+ // flatten object address if needed
+ if (!precise || (obj.index() == noreg && obj.offset() == 0)) {
+ __ store_check(obj.base());
+ } else {
+ __ lea(r3, obj);
+ __ store_check(r3);
+ }
+ }
+ }
+ break;
+ case BarrierSet::ModRef:
+ case BarrierSet::Other:
+ if (val == noreg) {
+ __ store_heap_oop_null(obj);
+ } else {
+ __ store_heap_oop(obj, val);
+ }
+ break;
+ default :
+ ShouldNotReachHere();
+
+ }
+}
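
As a rough sketch (illustrative only, using placeholder helpers rather than real HotSpot APIs), the G1 path generated above amounts to the following ordering of pre-barrier, store and post-barrier:

    // Placeholder declarations; only the ordering mirrors the generated code.
    struct oopDesc; typedef oopDesc* oop;
    void satb_enqueue(oop old_val);                  // what g1_write_barrier_pre records
    void record_dirty_card(oop* addr, oop new_val);  // what g1_write_barrier_post does

    void oop_store_g1(oop* addr, oop new_val) {
      oop old_val = *addr;
      if (old_val != nullptr) satb_enqueue(old_val);             // pre-barrier
      *addr = new_val;                                           // store_heap_oop / store_heap_oop_null
      if (new_val != nullptr) record_dirty_card(addr, new_val);  // post-barrier, cross-region check
    }

The card-table cases below it reduce to the plain store followed by a store_check on the (possibly flattened) destination address.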
+
+Address TemplateTable::at_bcp(int offset) {
+ assert(_desc->uses_bcp(), "inconsistent uses_bcp information");
+ return Address(rbcp, offset);
+}
+
+void TemplateTable::patch_bytecode(Bytecodes::Code bc, Register bc_reg,
+ Register temp_reg, bool load_bc_into_bc_reg/*=true*/,
+ int byte_no)
+{
+ if (!RewriteBytecodes) return;
+ Label L_patch_done;
+
+ switch (bc) {
+ case Bytecodes::_fast_aputfield:
+ case Bytecodes::_fast_bputfield:
+ case Bytecodes::_fast_cputfield:
+ case Bytecodes::_fast_dputfield:
+ case Bytecodes::_fast_fputfield:
+ case Bytecodes::_fast_iputfield:
+ case Bytecodes::_fast_lputfield:
+ case Bytecodes::_fast_sputfield:
+ {
+ // We skip bytecode quickening for putfield instructions when
+ // the put_code written to the constant pool cache is zero.
+ // This is required so that every execution of this instruction
+ // calls out to InterpreterRuntime::resolve_get_put to do
+ // additional, required work.
+ assert(byte_no == f1_byte || byte_no == f2_byte, "byte_no out of range");
+ assert(load_bc_into_bc_reg, "we use bc_reg as temp");
+ __ get_cache_and_index_and_bytecode_at_bcp(temp_reg, bc_reg, temp_reg, byte_no, 1);
+ __ movw(bc_reg, bc);
+ __ cmpw(temp_reg, (unsigned) 0);
+ __ br(Assembler::EQ, L_patch_done); // don't patch
+ }
+ break;
+ default:
+ assert(byte_no == -1, "sanity");
+ // the pair bytecodes have already done the load.
+ if (load_bc_into_bc_reg) {
+ __ movw(bc_reg, bc);
+ }
+ }
+
+ if (JvmtiExport::can_post_breakpoint()) {
+ Label L_fast_patch;
+ // if a breakpoint is present we can't rewrite the stream directly
+ __ load_unsigned_byte(temp_reg, at_bcp(0));
+ __ cmpw(temp_reg, Bytecodes::_breakpoint);
+ __ br(Assembler::NE, L_fast_patch);
+ // Let breakpoint table handling rewrite to quicker bytecode
+ __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::set_original_bytecode_at), rmethod, rbcp, bc_reg);
+ __ b(L_patch_done);
+ __ bind(L_fast_patch);
+ }
+
+#ifdef ASSERT
+ Label L_okay;
+ __ load_unsigned_byte(temp_reg, at_bcp(0));
+ __ cmpw(temp_reg, (int) Bytecodes::java_code(bc));
+ __ br(Assembler::EQ, L_okay);
+ __ cmpw(temp_reg, bc_reg);
+ __ br(Assembler::EQ, L_okay);
+ __ stop("patching the wrong bytecode");
+ __ bind(L_okay);
+#endif
+
+ // patch bytecode
+ __ strb(bc_reg, at_bcp(0));
+ __ bind(L_patch_done);
+}
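
Ignoring the breakpoint and putfield special cases handled above, the rewrite itself is a single byte store over the opcode at bcp, guarded in debug builds by a check that the byte being overwritten is the expected one. A minimal sketch of what the final strb amounts to:

    #include <cassert>
    // What strb(bc_reg, at_bcp(0)) ultimately does.
    void quicken(unsigned char* bcp, unsigned char fast_bc, unsigned char java_bc) {
      assert((*bcp == java_bc || *bcp == fast_bc) && "patching the wrong bytecode");
      *bcp = fast_bc;   // later executions dispatch straight to the fast template
    }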
+
+
+// Individual instructions
+
+void TemplateTable::nop() {
+ transition(vtos, vtos);
+ // nothing to do
+}
+
+void TemplateTable::shouldnotreachhere() {
+ transition(vtos, vtos);
+ __ stop("shouldnotreachhere bytecode");
+}
+
+void TemplateTable::aconst_null()
+{
+ transition(vtos, atos);
+ __ mov(r0, 0);
+}
+
+void TemplateTable::iconst(int value)
+{
+ transition(vtos, itos);
+ __ mov(r0, value);
+}
+
+void TemplateTable::lconst(int value)
+{
+ __ mov(r0, value);
+}
+
+void TemplateTable::fconst(int value)
+{
+ transition(vtos, ftos);
+ switch (value) {
+ case 0:
+ __ fmovs(v0, zr);
+ break;
+ case 1:
+ __ fmovs(v0, 1.0);
+ break;
+ case 2:
+ __ fmovs(v0, 2.0);
+ break;
+ default:
+ ShouldNotReachHere();
+ break;
+ }
+}
+
+void TemplateTable::dconst(int value)
+{
+ transition(vtos, dtos);
+ switch (value) {
+ case 0:
+ __ fmovd(v0, zr);
+ break;
+ case 1:
+ __ fmovd(v0, 1.0);
+ break;
+ case 2:
+ __ fmovd(v0, 2.0);
+ break;
+ default:
+ ShouldNotReachHere();
+ break;
+ }
+}
+
+void TemplateTable::bipush()
+{
+ transition(vtos, itos);
+ __ load_signed_byte32(r0, at_bcp(1));
+}
+
+void TemplateTable::sipush()
+{
+ transition(vtos, itos);
+ __ load_unsigned_short(r0, at_bcp(1));
+ __ revw(r0, r0);
+ __ asrw(r0, r0, 16);
+}
+
+void TemplateTable::ldc(bool wide)
+{
+ transition(vtos, vtos);
+ Label call_ldc, notFloat, notClass, Done;
+
+ if (wide) {
+ __ get_unsigned_2_byte_index_at_bcp(r1, 1);
+ } else {
+ __ load_unsigned_byte(r1, at_bcp(1));
+ }
+ __ get_cpool_and_tags(r2, r0);
+
+ const int base_offset = ConstantPool::header_size() * wordSize;
+ const int tags_offset = Array<u1>::base_offset_in_bytes();
+
+ // get type
+ __ add(r3, r1, tags_offset);
+ __ ldrb(r3, Address(r0, r3));
+
+ // unresolved class - get the resolved class
+ __ cmp(r3, JVM_CONSTANT_UnresolvedClass);
+ __ br(Assembler::EQ, call_ldc);
+
+ // unresolved class in error state - call into runtime to throw the error
+ // from the first resolution attempt
+ __ cmp(r3, JVM_CONSTANT_UnresolvedClassInError);
+ __ br(Assembler::EQ, call_ldc);
+
+ // resolved class - need to call vm to get java mirror of the class
+ __ cmp(r3, JVM_CONSTANT_Class);
+ __ br(Assembler::NE, notClass);
+
+ __ bind(call_ldc);
+ __ mov(c_rarg1, wide);
+ call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::ldc), c_rarg1);
+ __ push_ptr(r0);
+ __ verify_oop(r0);
+ __ b(Done);
+
+ __ bind(notClass);
+ __ cmp(r3, JVM_CONSTANT_Float);
+ __ br(Assembler::NE, notFloat);
+ // ftos
+ __ adds(r1, r2, r1, Assembler::LSL, 3);
+ __ ldrs(v0, Address(r1, base_offset));
+ __ push_f();
+ __ b(Done);
+
+ __ bind(notFloat);
+#ifdef ASSERT
+ {
+ Label L;
+ __ cmp(r3, JVM_CONSTANT_Integer);
+ __ br(Assembler::EQ, L);
+ // String and Object are rewritten to fast_aldc
+ __ stop("unexpected tag type in ldc");
+ __ bind(L);
+ }
+#endif
+ // itos JVM_CONSTANT_Integer only
+ __ adds(r1, r2, r1, Assembler::LSL, 3);
+ __ ldrw(r0, Address(r1, base_offset));
+ __ push_i(r0);
+ __ bind(Done);
+}
+
+// Fast path for caching oop constants.
+void TemplateTable::fast_aldc(bool wide)
+{
+ transition(vtos, atos);
+
+ Register result = r0;
+ Register tmp = r1;
+ int index_size = wide ? sizeof(u2) : sizeof(u1);
+
+ Label resolved;
+
+ // We are resolved if the resolved reference cache entry contains a
+ // non-null object (String, MethodType, etc.)
+ assert_different_registers(result, tmp);
+ __ get_cache_index_at_bcp(tmp, 1, index_size);
+ __ load_resolved_reference_at_index(result, tmp);
+ __ cbnz(result, resolved);
+
+ address entry = CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_ldc);
+
+ // first time invocation - must resolve first
+ __ mov(tmp, (int)bytecode());
+ __ call_VM(result, entry, tmp);
+
+ __ bind(resolved);
+
+ if (VerifyOops) {
+ __ verify_oop(result);
+ }
+}
+
+void TemplateTable::ldc2_w()
+{
+ transition(vtos, vtos);
+ Label Long, Done;
+ __ get_unsigned_2_byte_index_at_bcp(r0, 1);
+
+ __ get_cpool_and_tags(r1, r2);
+ const int base_offset = ConstantPool::header_size() * wordSize;
+ const int tags_offset = Array<u1>::base_offset_in_bytes();
+
+ // get type
+ __ lea(r2, Address(r2, r0, Address::lsl(0)));
+ __ load_unsigned_byte(r2, Address(r2, tags_offset));
+ __ cmpw(r2, (int)JVM_CONSTANT_Double);
+ __ br(Assembler::NE, Long);
+ // dtos
+ __ lea (r2, Address(r1, r0, Address::lsl(3)));
+ __ ldrd(v0, Address(r2, base_offset));
+ __ push_d();
+ __ b(Done);
+
+ __ bind(Long);
+ // ltos
+ __ lea(r0, Address(r1, r0, Address::lsl(3)));
+ __ ldr(r0, Address(r0, base_offset));
+ __ push_l();
+
+ __ bind(Done);
+}
+
+void TemplateTable::locals_index(Register reg, int offset)
+{
+ __ ldrb(reg, at_bcp(offset));
+ __ neg(reg, reg);
+}
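
Note the negation: rlocals points at local slot 0 and the locals area grows towards lower addresses, so iaddress(Register) scales the negated index by 8 (lsl(3)). A sketch of the address arithmetic this implies, assuming 8-byte stack slots:

    #include <cstdint>
    // Address of single-word local slot n, as locals_index() + iaddress(Register) compute it.
    uintptr_t local_slot_addr(uintptr_t rlocals, unsigned n) {
      return rlocals - (uintptr_t)n * 8;   // neg(reg), then Address(rlocals, reg, lsl(3))
    }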
+
+void TemplateTable::iload()
+{
+ transition(vtos, itos);
+ if (RewriteFrequentPairs) {
+ // TODO : check x86 code for what to do here
+ __ call_Unimplemented();
+ } else {
+ locals_index(r1);
+ __ ldr(r0, iaddress(r1));
+ }
+
+}
+
+void TemplateTable::fast_iload2()
+{
+ __ call_Unimplemented();
+}
+
+void TemplateTable::fast_iload()
+{
+ __ call_Unimplemented();
+}
+
+void TemplateTable::lload()
+{
+ transition(vtos, ltos);
+ __ ldrb(r1, at_bcp(1));
+ __ sub(r1, rlocals, r1, ext::uxtw, LogBytesPerWord);
+ __ ldr(r0, Address(r1, Interpreter::local_offset_in_bytes(1)));
+}
+
+void TemplateTable::fload()
+{
+ transition(vtos, ftos);
+ locals_index(r1);
+ // n.b. we use ldrd here because this is a 64 bit slot
+ // this is comparable to the iload case
+ __ ldrd(v0, faddress(r1));
+}
+
+void TemplateTable::dload()
+{
+ transition(vtos, dtos);
+ __ ldrb(r1, at_bcp(1));
+ __ sub(r1, rlocals, r1, ext::uxtw, LogBytesPerWord);
+ __ ldrd(v0, Address(r1, Interpreter::local_offset_in_bytes(1)));
+}
+
+void TemplateTable::aload()
+{
+ transition(vtos, atos);
+ locals_index(r1);
+ __ ldr(r0, iaddress(r1));
+}
+
+void TemplateTable::locals_index_wide(Register reg) {
+ __ ldrh(reg, at_bcp(2));
+ __ rev16w(reg, reg);
+ __ neg(reg, reg);
+}
+
+void TemplateTable::wide_iload() {
+ transition(vtos, itos);
+ locals_index_wide(r1);
+ __ ldr(r0, iaddress(r1));
+}
+
+void TemplateTable::wide_lload()
+{
+ transition(vtos, ltos);
+ __ ldrh(r1, at_bcp(2));
+ __ rev16w(r1, r1);
+ __ sub(r1, rlocals, r1, ext::uxtw, LogBytesPerWord);
+ __ ldr(r0, Address(r1, Interpreter::local_offset_in_bytes(1)));
+}
+
+void TemplateTable::wide_fload()
+{
+ transition(vtos, ftos);
+ locals_index_wide(r1);
+ // n.b. we use ldrd here because this is a 64 bit slot
+ // this is comparable to the iload case
+ __ ldrd(v0, faddress(r1));
+}
+
+void TemplateTable::wide_dload()
+{
+ transition(vtos, dtos);
+ __ ldrh(r1, at_bcp(2));
+ __ rev16w(r1, r1);
+ __ sub(r1, rlocals, r1, ext::uxtw, LogBytesPerWord);
+ __ ldrd(v0, Address(r1, Interpreter::local_offset_in_bytes(1)));
+}
+
+void TemplateTable::wide_aload()
+{
+ transition(vtos, atos);
+ locals_index_wide(r1);
+ __ ldr(r0, aaddress(r1));
+}
+
+void TemplateTable::index_check(Register array, Register index)
+{
+ // destroys r1, rscratch1
+ // check array
+ __ null_check(array, arrayOopDesc::length_offset_in_bytes());
+ // sign extend index for use by indexed load
+ // __ movl2ptr(index, index);
+ // check index
+ Register length = rscratch1;
+ __ ldrw(length, Address(array, arrayOopDesc::length_offset_in_bytes()));
+ __ cmpw(index, length);
+ if (index != r1) {
+ // ??? convention: move aberrant index into r1 for exception message
+ assert(r1 != array, "different registers");
+ __ mov(r1, index);
+ }
+ Label ok;
+ __ br(Assembler::LO, ok);
+ __ mov(rscratch1, Interpreter::_throw_ArrayIndexOutOfBoundsException_entry);
+ __ br(rscratch1);
+ __ bind(ok);
+}
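
The bounds check folds the negative-index and too-large cases into one unsigned comparison: cmpw followed by br(Assembler::LO, ok) treats a negative jint as a huge unsigned value. Equivalently:

    #include <cstdint>
    // Single unsigned compare used by index_check().
    bool index_in_range(int32_t index, int32_t length) {
      return (uint32_t)index < (uint32_t)length;   // false for index < 0 or index >= length
    }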
+
+void TemplateTable::iaload()
+{
+ transition(itos, itos);
+ __ mov(r1, r0);
+ __ pop_ptr(r0);
+ // r0: array
+ // r1: index
+ index_check(r0, r1); // leaves index in r1, kills rscratch1
+ __ lea(r1, Address(r0, r1, Address::uxtw(2)));
+ __ ldrw(r0, Address(r1, arrayOopDesc::base_offset_in_bytes(T_INT)));
+}
+
+void TemplateTable::laload()
+{
+ transition(itos, ltos);
+ __ mov(r1, r0);
+ __ pop_ptr(r0);
+ // r0: array
+ // r1: index
+ index_check(r0, r1); // leaves index in r1, kills rscratch1
+ __ lea(r1, Address(r0, r1, Address::uxtw(3)));
+ __ ldr(r0, Address(r1, arrayOopDesc::base_offset_in_bytes(T_LONG)));
+}
+
+void TemplateTable::faload()
+{
+ transition(itos, ftos);
+ __ mov(r1, r0);
+ __ pop_ptr(r0);
+ // r0: array
+ // r1: index
+ index_check(r0, r1); // leaves index in r1, kills rscratch1
+ __ lea(r1, Address(r0, r1, Address::uxtw(2)));
+ __ ldrs(v0, Address(r1, arrayOopDesc::base_offset_in_bytes(T_FLOAT)));
+}
+
+void TemplateTable::daload()
+{
+ transition(itos, dtos);
+ __ mov(r1, r0);
+ __ pop_ptr(r0);
+ // r0: array
+ // r1: index
+ index_check(r0, r1); // leaves index in r1, kills rscratch1
+ __ lea(r1, Address(r0, r1, Address::uxtw(3)));
+ __ ldrd(v0, Address(r1, arrayOopDesc::base_offset_in_bytes(T_DOUBLE)));
+}
+
+void TemplateTable::aaload()
+{
+ transition(itos, atos);
+ __ mov(r1, r0);
+ __ pop_ptr(r0);
+ // r0: array
+ // r1: index
+ index_check(r0, r1); // leaves index in r1, kills rscratch1
+ int s = (UseCompressedOops ? 2 : 3);
+ __ lea(r1, Address(r0, r1, Address::uxtw(s)));
+ __ load_heap_oop(r0, Address(r1, arrayOopDesc::base_offset_in_bytes(T_OBJECT)));
+}
+
+void TemplateTable::baload()
+{
+ transition(itos, itos);
+ __ mov(r1, r0);
+ __ pop_ptr(r0);
+ // r0: array
+ // r1: index
+ index_check(r0, r1); // leaves index in r1, kills rscratch1
+ __ lea(r1, Address(r0, r1, Address::uxtw(0)));
+ __ load_signed_byte(r0, Address(r1, arrayOopDesc::base_offset_in_bytes(T_BYTE)));
+}
+
+void TemplateTable::caload()
+{
+ transition(itos, itos);
+ __ mov(r1, r0);
+ __ pop_ptr(r0);
+ // r0: array
+ // r1: index
+ index_check(r0, r1); // leaves index in r1, kills rscratch1
+ __ lea(r1, Address(r0, r1, Address::uxtw(1)));
+ __ load_unsigned_short(r0, Address(r1, arrayOopDesc::base_offset_in_bytes(T_CHAR)));
+}
+
+// iload followed by caload frequent pair
+void TemplateTable::fast_icaload()
+{
+ __ call_Unimplemented();
+}
+
+void TemplateTable::saload()
+{
+ transition(itos, itos);
+ __ mov(r1, r0);
+ __ pop_ptr(r0);
+ // r0: array
+ // r1: index
+ index_check(r0, r1); // leaves index in r1, kills rscratch1
+ __ lea(r1, Address(r0, r1, Address::uxtw(1)));
+ __ load_signed_short(r0, Address(r1, arrayOopDesc::base_offset_in_bytes(T_SHORT)));
+}
+
+void TemplateTable::iload(int n)
+{
+ transition(vtos, itos);
+ __ ldr(r0, iaddress(n));
+}
+
+void TemplateTable::lload(int n)
+{
+ transition(vtos, ltos);
+ __ ldr(r0, laddress(n));
+}
+
+void TemplateTable::fload(int n)
+{
+ transition(vtos, ftos);
+ __ ldrs(v0, faddress(n));
+}
+
+void TemplateTable::dload(int n)
+{
+ transition(vtos, dtos);
+ __ ldrd(v0, daddress(n));
+}
+
+void TemplateTable::aload(int n)
+{
+ transition(vtos, atos);
+ __ ldr(r0, iaddress(n));
+}
+
+void TemplateTable::aload_0()
+{
+ // According to bytecode histograms, the pairs:
+ //
+ // _aload_0, _fast_igetfield
+ // _aload_0, _fast_agetfield
+ // _aload_0, _fast_fgetfield
+ //
+ // occur frequently. If RewriteFrequentPairs is set, the (slow)
+ // _aload_0 bytecode checks if the next bytecode is either
+ // _fast_igetfield, _fast_agetfield or _fast_fgetfield and then
+ // rewrites the current bytecode into a pair bytecode; otherwise it
+ // rewrites the current bytecode into _fast_aload_0 that doesn't do
+ // the pair check anymore.
+ //
+ // Note: If the next bytecode is _getfield, the rewrite must be
+ // delayed, otherwise we may miss an opportunity for a pair.
+ //
+ // Also rewrite frequent pairs
+ // aload_0, aload_1
+ // aload_0, iload_1
+ // These bytecodes, which need only a small amount of code, are the
+ // most profitable to rewrite.
+ if (RewriteFrequentPairs) {
+ __ call_Unimplemented();
+ } else {
+ aload(0);
+ }
+}
+
+void TemplateTable::istore()
+{
+ transition(itos, vtos);
+ locals_index(r1);
+ // FIXME: We're being very pernickety here, storing a jint in a
+ // local with strw, which costs an extra instruction over what we'd
+ // be able to do with a simple str. We should just store the whole
+ // word.
+ __ lea(rscratch1, iaddress(r1));
+ __ strw(r0, Address(rscratch1));
+}
+
+void TemplateTable::lstore()
+{
+ transition(ltos, vtos);
+ locals_index(r1);
+ __ str(r0, laddress(r1, rscratch1, _masm));
+}
+
+void TemplateTable::fstore() {
+ transition(ftos, vtos);
+ locals_index(r1);
+ __ lea(rscratch1, iaddress(r1));
+ __ strs(v0, Address(rscratch1));
+}
+
+void TemplateTable::dstore() {
+ transition(dtos, vtos);
+ locals_index(r1);
+ __ strd(v0, daddress(r1, rscratch1, _masm));
+}
+
+void TemplateTable::astore()
+{
+ transition(vtos, vtos);
+ __ pop_ptr(r0);
+ locals_index(r1);
+ __ str(r0, aaddress(r1));
+}
+
+void TemplateTable::wide_istore() {
+ transition(vtos, vtos);
+ __ pop_i();
+ locals_index_wide(r1);
+ __ lea(rscratch1, iaddress(r1));
+ __ strw(r0, Address(rscratch1));
+}
+
+void TemplateTable::wide_lstore() {
+ transition(vtos, vtos);
+ __ pop_l();
+ locals_index_wide(r1);
+ __ str(r0, laddress(r1, rscratch1, _masm));
+}
+
+void TemplateTable::wide_fstore() {
+ transition(vtos, vtos);
+ __ pop_f();
+ locals_index_wide(r1);
+ __ lea(rscratch1, faddress(r1));
+ __ strs(v0, rscratch1);
+}
+
+void TemplateTable::wide_dstore() {
+ transition(vtos, vtos);
+ __ pop_d();
+ locals_index_wide(r1);
+ __ strd(v0, daddress(r1, rscratch1, _masm));
+}
+
+void TemplateTable::wide_astore() {
+ transition(vtos, vtos);
+ __ pop_ptr(r0);
+ locals_index_wide(r1);
+ __ str(r0, aaddress(r1));
+}
+
+void TemplateTable::iastore() {
+ transition(itos, vtos);
+ __ pop_i(r1);
+ __ pop_ptr(r3);
+ // r0: value
+ // r1: index
+ // r3: array
+ index_check(r3, r1); // prefer index in r1
+ __ lea(rscratch1, Address(r3, r1, Address::uxtw(2)));
+ __ strw(r0, Address(rscratch1,
+ arrayOopDesc::base_offset_in_bytes(T_INT)));
+}
+
+void TemplateTable::lastore() {
+ transition(ltos, vtos);
+ __ pop_i(r1);
+ __ pop_ptr(r3);
+ // r0: value
+ // r1: index
+ // r3: array
+ index_check(r3, r1); // prefer index in r1
+ __ lea(rscratch1, Address(r3, r1, Address::uxtw(3)));
+ __ str(r0, Address(rscratch1,
+ arrayOopDesc::base_offset_in_bytes(T_LONG)));
+}
+
+void TemplateTable::fastore() {
+ transition(ftos, vtos);
+ __ pop_i(r1);
+ __ pop_ptr(r3);
+ // v0: value
+ // r1: index
+ // r3: array
+ index_check(r3, r1); // prefer index in r1
+ __ lea(rscratch1, Address(r3, r1, Address::uxtw(2)));
+ __ strs(v0, Address(rscratch1,
+ arrayOopDesc::base_offset_in_bytes(T_FLOAT)));
+}
+
+void TemplateTable::dastore() {
+ transition(dtos, vtos);
+ __ pop_i(r1);
+ __ pop_ptr(r3);
+ // v0: value
+ // r1: index
+ // r3: array
+ index_check(r3, r1); // prefer index in r1
+ __ lea(rscratch1, Address(r3, r1, Address::uxtw(3)));
+ __ strd(v0, Address(rscratch1,
+ arrayOopDesc::base_offset_in_bytes(T_DOUBLE)));
+}
+
+void TemplateTable::aastore() {
+ Label is_null, ok_is_subtype, done;
+ transition(vtos, vtos);
+ // stack: ..., array, index, value
+ __ ldr(r0, at_tos()); // value
+ __ ldr(r2, at_tos_p1()); // index
+ __ ldr(r3, at_tos_p2()); // array
+
+ Address element_address(r4, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
+
+ index_check(r3, r2); // kills r1
+ __ lea(r4, Address(r3, r2, Address::uxtw(UseCompressedOops? 2 : 3)));
+
+ // do array store check - check for NULL value first
+ __ cbz(r0, is_null);
+
+ // Move subklass into r1
+ __ load_klass(r1, r0);
+ // Move superklass into r0
+ __ load_klass(r0, r3);
+ __ ldr(r0, Address(r0,
+ ObjArrayKlass::element_klass_offset()));
+ // Compress array + index*oopSize + 12 into a single register. Frees r2.
+
+ // Generate subtype check. Blows r2, r5
+ // Superklass in r0. Subklass in r1.
+ __ gen_subtype_check(r1, ok_is_subtype);
+
+ // Come here on failure
+ // object is at TOS
+ __ b(Interpreter::_throw_ArrayStoreException_entry);
+
+ // Come here on success
+ __ bind(ok_is_subtype);
+
+ // Get the value we will store
+ __ ldr(r0, at_tos());
+ // Now store using the appropriate barrier
+ do_oop_store(_masm, element_address, r0, _bs->kind(), true);
+ __ b(done);
+
+ // Have a NULL in r0, r3=array, r2=index. Store NULL at ary[idx]
+ __ bind(is_null);
+ __ profile_null_seen(r2);
+
+ // Store a NULL
+ do_oop_store(_masm, element_address, noreg, _bs->kind(), true);
+
+ // Pop stack arguments
+ __ bind(done);
+ __ add(esp, esp, 3 * Interpreter::stackElementSize);
+}
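
For reference, the behaviour the aastore template implements is roughly the following (placeholder types and helpers, not HotSpot APIs):

    // Placeholder types; only the control flow mirrors the template above.
    struct Klass;
    struct Object { Klass* klass; };
    bool is_subtype_of(Klass* sub, Klass* super);          // gen_subtype_check analogue
    void throw_array_store_exception();                    // _throw_ArrayStoreException_entry
    void oop_store_with_barrier(Object** slot, Object* v); // do_oop_store analogue

    void aastore_logical(Object** slot, Object* value, Klass* element_klass) {
      if (value != nullptr && !is_subtype_of(value->klass, element_klass))
        throw_array_store_exception();
      oop_store_with_barrier(slot, value);   // a NULL store skips the type check but still uses do_oop_store
    }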
+
+void TemplateTable::bastore()
+{
+ transition(itos, vtos);
+ __ pop_i(r1);
+ __ pop_ptr(r3);
+ // r0: value
+ // r1: index
+ // r3: array
+ index_check(r3, r1); // prefer index in r1
+ __ lea(rscratch1, Address(r3, r1, Address::uxtw(0)));
+ __ strb(r0, Address(rscratch1,
+ arrayOopDesc::base_offset_in_bytes(T_BYTE)));
+}
+
+void TemplateTable::castore()
+{
+ transition(itos, vtos);
+ __ pop_i(r1);
+ __ pop_ptr(r3);
+ // r0: value
+ // r1: index
+ // r3: array
+ index_check(r3, r1); // prefer index in r1
+ __ lea(rscratch1, Address(r3, r1, Address::uxtw(1)));
+ __ strh(r0, Address(rscratch1,
+ arrayOopDesc::base_offset_in_bytes(T_CHAR)));
+}
+
+void TemplateTable::sastore()
+{
+ castore();
+}
+
+void TemplateTable::istore(int n)
+{
+ transition(itos, vtos);
+ __ str(r0, iaddress(n));
+}
+
+void TemplateTable::lstore(int n)
+{
+ transition(ltos, vtos);
+ __ str(r0, laddress(n));
+}
+
+void TemplateTable::fstore(int n)
+{
+ transition(ftos, vtos);
+ __ strs(v0, faddress(n));
+}
+
+void TemplateTable::dstore(int n)
+{
+ transition(dtos, vtos);
+ __ strd(v0, daddress(n));
+}
+
+void TemplateTable::astore(int n)
+{
+ transition(vtos, vtos);
+ __ pop_ptr(r0);
+ __ str(r0, iaddress(n));
+}
+
+void TemplateTable::pop()
+{
+ transition(vtos, vtos);
+ __ add(esp, esp, Interpreter::stackElementSize);
+}
+
+void TemplateTable::pop2()
+{
+ transition(vtos, vtos);
+ __ add(esp, esp, 2 * Interpreter::stackElementSize);
+}
+
+void TemplateTable::dup()
+{
+ transition(vtos, vtos);
+ __ ldr(r0, Address(esp, 0));
+ __ push(r0);
+ // stack: ..., a, a
+}
+
+void TemplateTable::dup_x1()
+{
+ transition(vtos, vtos);
+ // stack: ..., a, b
+ __ ldr(r0, at_tos()); // load b
+ __ ldr(r2, at_tos_p1()); // load a
+ __ str(r0, at_tos_p1()); // store b
+ __ str(r2, at_tos()); // store a
+ __ push(r0); // push b
+ // stack: ..., b, a, b
+}
+
+void TemplateTable::dup_x2()
+{
+ transition(vtos, vtos);
+ // stack: ..., a, b, c
+ __ ldr(r0, at_tos()); // load c
+ __ ldr(r2, at_tos_p2()); // load a
+ __ str(r0, at_tos_p2()); // store c in a
+ __ push(r0); // push c
+ // stack: ..., c, b, c, c
+ __ ldr(r0, at_tos_p2()); // load b
+ __ str(r2, at_tos_p2()); // store a in b
+ // stack: ..., c, a, c, c
+ __ str(r0, at_tos_p1()); // store b in c
+ // stack: ..., c, a, b, c
+}
+
+void TemplateTable::dup2()
+{
+ transition(vtos, vtos);
+ // stack: ..., a, b
+ __ ldr(r0, at_tos_p1()); // load a
+ __ push(r0); // push a
+ __ ldr(r0, at_tos_p1()); // load b
+ __ push(r0); // push b
+ // stack: ..., a, b, a, b
+}
+
+void TemplateTable::dup2_x1()
+{
+ transition(vtos, vtos);
+ // stack: ..., a, b, c
+ __ ldr(r2, at_tos()); // load c
+ __ ldr(r0, at_tos_p1()); // load b
+ __ push(r0); // push b
+ __ push(r2); // push c
+ // stack: ..., a, b, c, b, c
+ __ str(r2, at_tos_p3()); // store c in b
+ // stack: ..., a, c, c, b, c
+ __ ldr(r2, at_tos_p4()); // load a
+ __ str(r2, at_tos_p2()); // store a in 2nd c
+ // stack: ..., a, c, a, b, c
+ __ str(r0, at_tos_p4()); // store b in a
+ // stack: ..., b, c, a, b, c
+}
+
+void TemplateTable::dup2_x2()
+{
+ transition(vtos, vtos);
+ // stack: ..., a, b, c, d
+ __ ldr(r2, at_tos()); // load d
+ __ ldr(r0, at_tos_p1()); // load c
+ __ push(r0) ; // push c
+ __ push(r2); // push d
+ // stack: ..., a, b, c, d, c, d
+ __ ldr(r0, at_tos_p4()); // load b
+ __ str(r0, at_tos_p2()); // store b in d
+ __ str(r2, at_tos_p4()); // store d in b
+ // stack: ..., a, d, c, b, c, d
+ __ ldr(r2, at_tos_p5()); // load a
+ __ ldr(r0, at_tos_p3()); // load c
+ __ str(r2, at_tos_p3()); // store a in c
+ __ str(r0, at_tos_p5()); // store c in a
+ // stack: ..., c, d, a, b, c, d
+}
+
+void TemplateTable::swap()
+{
+ transition(vtos, vtos);
+ // stack: ..., a, b
+ __ ldr(r2, at_tos_p1()); // load a
+ __ ldr(r0, at_tos()); // load b
+ __ str(r2, at_tos()); // store a in b
+ __ str(r0, at_tos_p1()); // store b in a
+ // stack: ..., b, a
+}
+
+void TemplateTable::iop2(Operation op)
+{
+ transition(itos, itos);
+ // r0 <== r1 op r0
+ __ pop_i(r1);
+ switch (op) {
+ case add : __ addw(r0, r1, r0); break;
+ case sub : __ subw(r0, r1, r0); break;
+ case mul : __ mulw(r0, r1, r0); break;
+ case _and : __ andw(r0, r1, r0); break;
+ case _or : __ orrw(r0, r1, r0); break;
+ case _xor : __ eorw(r0, r1, r0); break;
+ case shl : __ lslvw(r0, r1, r0); break;
+ case shr : __ asrvw(r0, r1, r0); break;
+ case ushr : __ lsrvw(r0, r1, r0);break;
+ default : ShouldNotReachHere();
+ }
+}
+
+void TemplateTable::lop2(Operation op)
+{
+ transition(ltos, ltos);
+ // r0 <== r1 op r0
+ __ pop_l(r1);
+ switch (op) {
+ case add : __ add(r0, r1, r0); break;
+ case sub : __ sub(r0, r1, r0); break;
+ case mul : __ mul(r0, r1, r0); break;
+ case _and : __ andr(r0, r1, r0); break;
+ case _or : __ orr(r0, r1, r0); break;
+ case _xor : __ eor(r0, r1, r0); break;
+ default : ShouldNotReachHere();
+ }
+}
+
+void TemplateTable::idiv()
+{
+ transition(itos, itos);
+ // explicitly check for div0
+ Label no_div0;
+ __ cbnzw(r0, no_div0);
+ __ mov(rscratch1, Interpreter::_throw_ArithmeticException_entry);
+ __ br(rscratch1);
+ __ bind(no_div0);
+ __ pop_i(r1);
+ // r0 <== r1 idiv r0
+ __ corrected_idivl(r0, r1, r0, /* want_remainder */ false);
+}
+
+void TemplateTable::irem()
+{
+ transition(itos, itos);
+ // explicitly check for div0
+ Label no_div0;
+ __ cbnzw(r0, no_div0);
+ __ mov(rscratch1, Interpreter::_throw_ArithmeticException_entry);
+ __ br(rscratch1);
+ __ bind(no_div0);
+ __ pop_i(r1);
+ // r0 <== r1 irem r0
+ __ corrected_idivl(r0, r1, r0, /* want_remainder */ true);
+}
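
Both templates branch to the ArithmeticException entry on a zero divisor before dividing; corrected_idivl then supplies Java's overflow rule, where MIN_VALUE divided by -1 yields MIN_VALUE and the corresponding remainder is 0. In plain C++ terms (sketch; the divisor-zero case is already excluded above):

    #include <cstdint>
    // Java int division/remainder semantics provided by corrected_idivl().
    int32_t java_idiv(int32_t a, int32_t b) {
      if (b == -1) return (int32_t)(0u - (uint32_t)a);  // INT_MIN / -1 == INT_MIN, no overflow trap
      return a / b;
    }
    int32_t java_irem(int32_t a, int32_t b) {
      if (b == -1) return 0;                            // INT_MIN % -1 == 0
      return a % b;
    }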
+
+void TemplateTable::lmul()
+{
+ transition(ltos, ltos);
+ __ pop_l(r1);
+ __ mul(r0, r0, r1);
+}
+
+void TemplateTable::ldiv()
+{
+ transition(ltos, ltos);
+ // explicitly check for div0
+ Label no_div0;
+ __ cbnz(r0, no_div0);
+ __ mov(rscratch1, Interpreter::_throw_ArithmeticException_entry);
+ __ br(rscratch1);
+ __ bind(no_div0);
+ __ pop_l(r1);
+ // r0 <== r1 ldiv r0
+ __ corrected_idivq(r0, r1, r0, /* want_remainder */ false);
+}
+
+void TemplateTable::lrem()
+{
+ transition(ltos, ltos);
+ // explicitly check for div0
+ Label no_div0;
+ __ cbnz(r0, no_div0);
+ __ mov(rscratch1, Interpreter::_throw_ArithmeticException_entry);
+ __ br(rscratch1);
+ __ bind(no_div0);
+ __ pop_l(r1);
+ // r0 <== r1 lrem r0
+ __ corrected_idivq(r0, r1, r0, /* want_remainder */ true);
+}
+
+void TemplateTable::lshl()
+{
+ transition(itos, ltos);
+ // shift count is in r0
+ __ pop_l(r1);
+ __ lslv(r0, r1, r0);
+}
+
+void TemplateTable::lshr()
+{
+ transition(itos, ltos);
+ // shift count is in r0
+ __ pop_l(r1);
+ __ asrv(r0, r1, r0);
+}
+
+void TemplateTable::lushr()
+{
+ transition(itos, ltos);
+ // shift count is in r0
+ __ pop_l(r1);
+ __ lsrv(r0, r1, r0);
+}
+
+void TemplateTable::fop2(Operation op)
+{
+ transition(ftos, ftos);
+ switch (op) {
+ case add:
+ // n.b. use ldrd because this is a 64 bit slot
+ __ pop_f(v1);
+ __ fadds(v0, v1, v0);
+ break;
+ case sub:
+ __ pop_f(v1);
+ __ fsubs(v0, v1, v0);
+ break;
+ case mul:
+ __ pop_f(v1);
+ __ fmuls(v0, v1, v0);
+ break;
+ case div:
+ __ pop_f(v1);
+ __ fdivs(v0, v1, v0);
+ break;
+ case rem:
+ __ fmovs(v1, v0);
+ __ pop_f(v0);
+ __ call_VM_leaf_base1(CAST_FROM_FN_PTR(address, SharedRuntime::frem),
+ 0, 2, MacroAssembler::ret_type_float);
+ break;
+ default:
+ ShouldNotReachHere();
+ break;
+ }
+}
+
+void TemplateTable::dop2(Operation op)
+{
+ transition(dtos, dtos);
+ switch (op) {
+ case add:
+ // n.b. use ldrd because this is a 64 bit slot
+ __ pop_d(v1);
+ __ faddd(v0, v1, v0);
+ break;
+ case sub:
+ __ pop_d(v1);
+ __ fsubd(v0, v1, v0);
+ break;
+ case mul:
+ __ pop_d(v1);
+ __ fmuld(v0, v1, v0);
+ break;
+ case div:
+ __ pop_d(v1);
+ __ fdivd(v0, v1, v0);
+ break;
+ case rem:
+ __ fmovd(v1, v0);
+ __ pop_d(v0);
+ __ call_VM_leaf_base1(CAST_FROM_FN_PTR(address, SharedRuntime::drem),
+ 0, 2, MacroAssembler::ret_type_double);
+ break;
+ default:
+ ShouldNotReachHere();
+ break;
+ }
+}
+
+void TemplateTable::ineg()
+{
+ transition(itos, itos);
+ __ negw(r0, r0);
+
+}
+
+void TemplateTable::lneg()
+{
+ transition(ltos, ltos);
+ __ neg(r0, r0);
+}
+
+void TemplateTable::fneg()
+{
+ transition(ftos, ftos);
+ __ fnegs(v0, v0);
+}
+
+void TemplateTable::dneg()
+{
+ transition(dtos, dtos);
+ __ fnegd(v0, v0);
+}
+
+void TemplateTable::iinc()
+{
+ transition(vtos, vtos);
+ __ load_signed_byte(r1, at_bcp(2)); // get constant
+ locals_index(r2);
+ __ ldr(r0, iaddress(r2));
+ __ addw(r0, r0, r1);
+ __ str(r0, iaddress(r2));
+}
+
+void TemplateTable::wide_iinc()
+{
+ transition(vtos, vtos);
+ // __ mov(r1, zr);
+ __ ldrw(r1, at_bcp(2)); // get constant and index
+ __ rev16(r1, r1);
+ __ ubfx(r2, r1, 0, 16);
+ __ neg(r2, r2);
+ __ sbfx(r1, r1, 16, 16);
+ __ ldr(r0, iaddress(r2));
+ __ addw(r0, r0, r1);
+ __ str(r0, iaddress(r2));
+}
+
+void TemplateTable::convert()
+{
+ // Checking
+#ifdef ASSERT
+ {
+ TosState tos_in = ilgl;
+ TosState tos_out = ilgl;
+ switch (bytecode()) {
+ case Bytecodes::_i2l: // fall through
+ case Bytecodes::_i2f: // fall through
+ case Bytecodes::_i2d: // fall through
+ case Bytecodes::_i2b: // fall through
+ case Bytecodes::_i2c: // fall through
+ case Bytecodes::_i2s: tos_in = itos; break;
+ case Bytecodes::_l2i: // fall through
+ case Bytecodes::_l2f: // fall through
+ case Bytecodes::_l2d: tos_in = ltos; break;
+ case Bytecodes::_f2i: // fall through
+ case Bytecodes::_f2l: // fall through
+ case Bytecodes::_f2d: tos_in = ftos; break;
+ case Bytecodes::_d2i: // fall through
+ case Bytecodes::_d2l: // fall through
+ case Bytecodes::_d2f: tos_in = dtos; break;
+ default : ShouldNotReachHere();
+ }
+ switch (bytecode()) {
+ case Bytecodes::_l2i: // fall through
+ case Bytecodes::_f2i: // fall through
+ case Bytecodes::_d2i: // fall through
+ case Bytecodes::_i2b: // fall through
+ case Bytecodes::_i2c: // fall through
+ case Bytecodes::_i2s: tos_out = itos; break;
+ case Bytecodes::_i2l: // fall through
+ case Bytecodes::_f2l: // fall through
+ case Bytecodes::_d2l: tos_out = ltos; break;
+ case Bytecodes::_i2f: // fall through
+ case Bytecodes::_l2f: // fall through
+ case Bytecodes::_d2f: tos_out = ftos; break;
+ case Bytecodes::_i2d: // fall through
+ case Bytecodes::_l2d: // fall through
+ case Bytecodes::_f2d: tos_out = dtos; break;
+ default : ShouldNotReachHere();
+ }
+ transition(tos_in, tos_out);
+ }
+#endif // ASSERT
+ // static const int64_t is_nan = 0x8000000000000000L;
+
+ // Conversion
+ switch (bytecode()) {
+ case Bytecodes::_i2l:
+ __ sxtw(r0, r0);
+ break;
+ case Bytecodes::_i2f:
+ __ scvtfws(v0, r0);
+ break;
+ case Bytecodes::_i2d:
+ __ scvtfwd(v0, r0);
+ break;
+ case Bytecodes::_i2b:
+ __ sxtbw(r0, r0);
+ break;
+ case Bytecodes::_i2c:
+ __ uxthw(r0, r0);
+ break;
+ case Bytecodes::_i2s:
+ __ sxthw(r0, r0);
+ break;
+ case Bytecodes::_l2i:
+ __ uxtw(r0, r0);
+ break;
+ case Bytecodes::_l2f:
+ __ scvtfs(v0, r0);
+ break;
+ case Bytecodes::_l2d:
+ __ scvtfd(v0, r0);
+ break;
+ case Bytecodes::_f2i:
+ {
+ Label L_Okay;
+ __ clear_fpsr();
+ __ fcvtzsw(r0, v0);
+ __ get_fpsr(r1);
+ __ cbzw(r1, L_Okay);
+ __ call_VM_leaf_base1(CAST_FROM_FN_PTR(address, SharedRuntime::f2i),
+ 0, 1, MacroAssembler::ret_type_integral);
+ __ bind(L_Okay);
+ }
+ break;
+ case Bytecodes::_f2l:
+ {
+ Label L_Okay;
+ __ clear_fpsr();
+ __ fcvtzs(r0, v0);
+ __ get_fpsr(r1);
+ __ cbzw(r1, L_Okay);
+ __ call_VM_leaf_base1(CAST_FROM_FN_PTR(address, SharedRuntime::f2l),
+ 0, 1, MacroAssembler::ret_type_integral);
+ __ bind(L_Okay);
+ }
+ break;
+ case Bytecodes::_f2d:
+ __ fcvts(v0, v0);
+ break;
+ case Bytecodes::_d2i:
+ {
+ Label L_Okay;
+ __ clear_fpsr();
+ __ fcvtzdw(r0, v0);
+ __ get_fpsr(r1);
+ __ cbzw(r1, L_Okay);
+ __ call_VM_leaf_base1(CAST_FROM_FN_PTR(address, SharedRuntime::d2i),
+ 0, 1, MacroAssembler::ret_type_integral);
+ __ bind(L_Okay);
+ }
+ break;
+ case Bytecodes::_d2l:
+ {
+ Label L_Okay;
+ __ clear_fpsr();
+ __ fcvtzd(r0, v0);
+ __ get_fpsr(r1);
+ __ cbzw(r1, L_Okay);
+ __ call_VM_leaf_base1(CAST_FROM_FN_PTR(address, SharedRuntime::d2l),
+ 0, 1, MacroAssembler::ret_type_integral);
+ __ bind(L_Okay);
+ }
+ break;
+ case Bytecodes::_d2f:
+ __ fcvtd(v0, v0);
+ break;
+ default:
+ ShouldNotReachHere();
+ }
+}
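
The float/double-to-integer cases convert with fcvtzs*, then read FPSR; if any exception flag is set (NaN or out-of-range input) they fall back to the SharedRuntime helper, which applies Java's saturating rules. For f2i those rules are (sketch):

    #include <cstdint>
    // Java f2i semantics provided on the slow path (SharedRuntime::f2i).
    int32_t java_f2i(float x) {
      if (x != x) return 0;                        // NaN -> 0
      if (x >= 2147483648.0f) return INT32_MAX;    // saturate high
      if (x <= -2147483648.0f) return INT32_MIN;   // saturate low
      return (int32_t)x;                           // in range: truncate toward zero
    }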
+
+void TemplateTable::lcmp()
+{
+ transition(ltos, itos);
+ Label done;
+ __ pop_l(r1);
+ __ cmp(r1, r0);
+ __ mov(r0, (u_int64_t)-1L);
+ __ br(Assembler::LT, done);
+ // __ mov(r0, 1UL);
+ // __ csel(r0, r0, zr, Assembler::NE);
+ // and here is a faster way
+ __ csinc(r0, zr, zr, Assembler::EQ);
+ __ bind(done);
+}
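
The comparison leaves -1, 0 or 1 in r0 without a second branch: after the early exit for LT, csinc selects zr (0) when EQ and zr + 1 (1) otherwise. Equivalent C++:

    #include <cstdint>
    // Result computed by the mov/br/csinc sequence above.
    int32_t lcmp(int64_t a, int64_t b) {
      if (a < b) return -1;
      return (a == b) ? 0 : 1;   // csinc(r0, zr, zr, EQ)
    }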
+
+void TemplateTable::float_cmp(bool is_float, int unordered_result)
+{
+ Label done;
+ if (is_float) {
+ // XXX get rid of pop here, use ... reg, mem32
+ __ pop_f(v1);
+ __ fcmps(v1, v0);
+ } else {
+ // XXX get rid of pop here, use ... reg, mem64
+ __ pop_d(v1);
+ __ fcmpd(v1, v0);
+ }
+ if (unordered_result < 0) {
+ // we want -1 for unordered or less than, 0 for equal and 1 for
+ // greater than.
+ __ mov(r0, (u_int64_t)-1L);
+ // for FP LT tests less than or unordered
+ __ br(Assembler::LT, done);
+ // install 0 for EQ otherwise 1
+ __ csinc(r0, zr, zr, Assembler::EQ);
+ } else {
+ // we want -1 for less than, 0 for equal and 1 for unordered or
+ // greater than.
+ __ mov(r0, 1L);
+ // for FP HI tests greater than or unordered
+ __ br(Assembler::HI, done);
+ // install 0 for EQ otherwise ~0
+ __ csinv(r0, zr, zr, Assembler::EQ);
+
+ }
+ __ bind(done);
+}
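
unordered_result distinguishes the fcmpl/dcmpl bytecodes (NaN compares as less, -1) from fcmpg/dcmpg (NaN compares as greater, +1). As a sketch of the result being computed:

    #include <cstdint>
    // Semantics selected by unordered_result (< 0 for fcmpl/dcmpl, > 0 for fcmpg/dcmpg).
    int32_t java_fcmp(double a, double b, int unordered_result) {
      if (a != a || b != b) return unordered_result < 0 ? -1 : 1;  // unordered (NaN)
      if (a < b)  return -1;
      if (a == b) return 0;
      return 1;
    }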
+
+void TemplateTable::branch(bool is_jsr, bool is_wide)
+{
+ // We might be moving to a safepoint. The thread which calls
+ // Interpreter::notice_safepoints() will effectively flush its cache
+ // when it makes a system call, but we need to do something to
+ // ensure that we see the changed dispatch table.
+ __ membar(MacroAssembler::LoadLoad);
+
+ __ profile_taken_branch(r0, r1);
+ const ByteSize be_offset = MethodCounters::backedge_counter_offset() +
+ InvocationCounter::counter_offset();
+ const ByteSize inv_offset = MethodCounters::invocation_counter_offset() +
+ InvocationCounter::counter_offset();
+
+ // load branch displacement
+ if (!is_wide) {
+ __ ldrh(r2, at_bcp(1));
+ __ rev16(r2, r2);
+ // sign extend the 16 bit value in r2
+ __ sbfm(r2, r2, 0, 15);
+ } else {
+ __ ldrw(r2, at_bcp(1));
+ __ revw(r2, r2);
+ // sign extend the 32 bit value in r2
+ __ sbfm(r2, r2, 0, 31);
+ }
+
+ // Handle all the JSR stuff here, then exit.
+ // It's much shorter and cleaner than intermingling with the non-JSR
+ // normal-branch stuff occurring below.
+
+ if (is_jsr) {
+ // Pre-load the next target bytecode into rscratch1
+ __ load_unsigned_byte(rscratch1, Address(rbcp, r2));
+ // compute return address as bci
+ __ ldr(rscratch2, Address(rmethod, Method::const_offset()));
+ __ add(rscratch2, rscratch2,
+ in_bytes(ConstMethod::codes_offset()) - (is_wide ? 5 : 3));
+ __ sub(r1, rbcp, rscratch2);
+ __ push_i(r1);
+ // Adjust the bcp by the 16-bit displacement in r2
+ __ add(rbcp, rbcp, r2);
+ __ dispatch_only(vtos);
+ return;
+ }
+
+ // Normal (non-jsr) branch handling
+
+ // Adjust the bcp by the displacement in r2
+ __ add(rbcp, rbcp, r2);
+
+ assert(UseLoopCounter || !UseOnStackReplacement,
+ "on-stack-replacement requires loop counters");
+ Label backedge_counter_overflow;
+ Label profile_method;
+ Label dispatch;
+ if (UseLoopCounter) {
+ // increment backedge counter for backward branches
+ // r0: MDO
+ // w1: MDO bumped taken-count
+ // r2: target offset
+ __ cmp(r2, zr);
+ __ br(Assembler::GT, dispatch); // count only if backward branch
+
+ // ECN: FIXME: This code smells
+ // check if MethodCounters exists
+ Label has_counters;
+ __ ldr(rscratch1, Address(rmethod, Method::method_counters_offset()));
+ __ cbnz(rscratch1, has_counters);
+ __ push(r0);
+ __ push(r1);
+ __ push(r2);
+ __ call_VM(noreg, CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::build_method_counters), rmethod);
+ __ pop(r2);
+ __ pop(r1);
+ __ pop(r0);
+ __ ldr(rscratch1, Address(rmethod, Method::method_counters_offset()));
+ __ cbz(rscratch1, dispatch); // No MethodCounters allocated, OutOfMemory
+ __ bind(has_counters);
+
+ if (TieredCompilation) {
+ Label no_mdo;
+ int increment = InvocationCounter::count_increment;
+ int mask = ((1 << Tier0BackedgeNotifyFreqLog) - 1) << InvocationCounter::count_shift;
+ if (ProfileInterpreter) {
+ // Are we profiling?
+ __ ldr(r1, Address(rmethod, in_bytes(Method::method_data_offset())));
+ __ cbz(r1, no_mdo);
+ // Increment the MDO backedge counter
+ const Address mdo_backedge_counter(r1, in_bytes(MethodData::backedge_counter_offset()) +
+ in_bytes(InvocationCounter::counter_offset()));
+ __ increment_mask_and_jump(mdo_backedge_counter, increment, mask,
+ r0, false, Assembler::EQ, &backedge_counter_overflow);
+ __ b(dispatch);
+ }
+ __ bind(no_mdo);
+ // Increment backedge counter in MethodCounters*
+ __ ldr(rscratch1, Address(rmethod, Method::method_counters_offset()));
+ __ increment_mask_and_jump(Address(rscratch1, be_offset), increment, mask,
+ r0, false, Assembler::EQ, &backedge_counter_overflow);
+ } else {
+ // increment counter
+ __ ldr(rscratch2, Address(rmethod, Method::method_counters_offset()));
+ __ ldrw(r0, Address(rscratch2, be_offset)); // load backedge counter
+ __ addw(rscratch1, r0, InvocationCounter::count_increment); // increment counter
+ __ strw(rscratch1, Address(rscratch2, be_offset)); // store counter
+
+ __ ldrw(r0, Address(rscratch2, inv_offset)); // load invocation counter
+ __ andw(r0, r0, (unsigned)InvocationCounter::count_mask_value); // and the status bits
+ __ addw(r0, r0, rscratch1); // add both counters
+
+ if (ProfileInterpreter) {
+ // Test to see if we should create a method data oop
+ __ lea(rscratch1, ExternalAddress((address) &InvocationCounter::InterpreterProfileLimit));
+ __ ldrw(rscratch1, rscratch1);
+ __ cmpw(r0, rscratch1);
+ __ br(Assembler::LT, dispatch);
+
+ // if no method data exists, go to profile method
+ __ test_method_data_pointer(r0, profile_method);
+
+ if (UseOnStackReplacement) {
+ // check for overflow against w1 which is the MDO taken count
+ __ lea(rscratch1, ExternalAddress((address) &InvocationCounter::InterpreterBackwardBranchLimit));
+ __ ldrw(rscratch1, rscratch1);
+ __ cmpw(r1, rscratch1);
+ __ br(Assembler::LO, dispatch); // Intel == Assembler::below
+
+ // When ProfileInterpreter is on, the backedge_count comes
+ // from the MethodData*, whose value does not get reset on
+ // the call to frequency_counter_overflow(). To avoid
+ // excessive calls to the overflow routine while the method is
+ // being compiled, add a second test to make sure the overflow
+ // function is called only once every overflow_frequency.
+ const int overflow_frequency = 1024;
+ __ andsw(r1, r1, overflow_frequency - 1);
+ __ br(Assembler::EQ, backedge_counter_overflow);
+
+ }
+ } else {
+ if (UseOnStackReplacement) {
+ // check for overflow against w0, which is the sum of the
+ // counters
+ __ lea(rscratch1, ExternalAddress((address) &InvocationCounter::InterpreterBackwardBranchLimit));
+ __ ldrw(rscratch1, rscratch1);
+ __ cmpw(r0, rscratch1);
+ __ br(Assembler::HS, backedge_counter_overflow); // Intel == Assembler::aboveEqual
+ }
+ }
+ }
+ }
+ __ bind(dispatch);
+
+ // Pre-load the next target bytecode into rscratch1
+ __ load_unsigned_byte(rscratch1, Address(rbcp, 0));
+
+ // continue with the bytecode @ target
+ // rscratch1: target bytecode
+ // rbcp: target bcp
+ __ dispatch_only(vtos);
+
+ if (UseLoopCounter) {
+ if (ProfileInterpreter) {
+ // Out-of-line code to allocate method data oop.
+ __ bind(profile_method);
+ __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::profile_method));
+ __ load_unsigned_byte(r1, Address(rbcp, 0)); // restore target bytecode
+ __ set_method_data_pointer_for_bcp();
+ __ b(dispatch);
+ }
+
+ if (TieredCompilation || UseOnStackReplacement) {
+ // invocation counter overflow
+ __ bind(backedge_counter_overflow);
+ __ neg(r2, r2);
+ __ add(r2, r2, rbcp); // branch bcp
+ // IcoResult frequency_counter_overflow([JavaThread*], address branch_bcp)
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::frequency_counter_overflow),
+ r2);
+ if (!UseOnStackReplacement)
+ __ b(dispatch);
+ }
+
+ if (UseOnStackReplacement) {
+ __ load_unsigned_byte(r1, Address(rbcp, 0)); // restore target bytecode
+
+ // r0: osr nmethod (osr ok) or NULL (osr not possible)
+ // w1: target bytecode
+ // r2: scratch
+ __ cbz(r0, dispatch); // test result -- no osr if null
+ // nmethod may have been invalidated (VM may block upon call_VM return)
+ __ ldrb(r2, Address(r0, nmethod::state_offset()));
+ if (nmethod::in_use != 0)
+ __ sub(r2, r2, nmethod::in_use);
+ __ cbnz(r2, dispatch);
+
+ // We have the address of an on stack replacement routine in r0
+ // We need to prepare to execute the OSR method. First we must
+ // migrate the locals and monitors off of the stack.
+
+ __ mov(r19, r0); // save the nmethod
+
+ call_VM(noreg, CAST_FROM_FN_PTR(address, SharedRuntime::OSR_migration_begin));
+
+ // r0 is OSR buffer, move it to expected parameter location
+ __ mov(j_rarg0, r0);
+
+ // remove activation
+ // get sender esp
+ __ ldr(esp,
+ Address(rfp, frame::interpreter_frame_sender_sp_offset * wordSize));
+ // remove frame anchor
+ __ leave();
+ // Ensure compiled code always sees stack at proper alignment
+ __ andr(sp, esp, -16);
+
+ // and begin the OSR nmethod
+ __ ldr(rscratch1, Address(r19, nmethod::osr_entry_point_offset()));
+ __ br(rscratch1);
+ }
+ }
+}
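
A note on the displacement load at the top of branch(): the class file stores the 2-byte (or, for wide branches, 4-byte) offset big-endian, so after the little-endian ldrh/ldrw it is byte-reversed with rev16/revw and sign-extended with sbfm. For the short form this amounts to (sketch):

    #include <cstdint>
    // Decode the signed 16-bit branch displacement stored big-endian at bcp+1.
    int64_t short_branch_displacement(const uint8_t* bcp) {
      uint16_t raw = (uint16_t)((bcp[1] << 8) | bcp[2]);  // ldrh + rev16
      return (int16_t)raw;                                // sbfm(r2, r2, 0, 15): sign extend
    }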
+
+
+void TemplateTable::if_0cmp(Condition cc)
+{
+ transition(itos, vtos);
+ // assume branch is more often taken than not (loops use backward branches)
+ Label not_taken;
+ if (cc == equal)
+ __ cbnzw(r0, not_taken);
+ else if (cc == not_equal)
+ __ cbzw(r0, not_taken);
+ else {
+ __ andsw(zr, r0, r0);
+ __ br(j_not(cc), not_taken);
+ }
+
+ branch(false, false);
+ __ bind(not_taken);
+ __ profile_not_taken_branch(r0);
+}
+
+void TemplateTable::if_icmp(Condition cc)
+{
+ transition(itos, vtos);
+ // assume branch is more often taken than not (loops use backward branches)
+ Label not_taken;
+ __ pop_i(r1);
+ __ cmpw(r1, r0, Assembler::LSL);
+ __ br(j_not(cc), not_taken);
+ branch(false, false);
+ __ bind(not_taken);
+ __ profile_not_taken_branch(r0);
+}
+
+void TemplateTable::if_nullcmp(Condition cc)
+{
+ transition(atos, vtos);
+ // assume branch is more often taken than not (loops use backward branches)
+ Label not_taken;
+ if (cc == equal)
+ __ cbnz(r0, not_taken);
+ else
+ __ cbz(r0, not_taken);
+ branch(false, false);
+ __ bind(not_taken);
+ __ profile_not_taken_branch(r0);
+}
+
+void TemplateTable::if_acmp(Condition cc)
+{
+ transition(atos, vtos);
+ // assume branch is more often taken than not (loops use backward branches)
+ Label not_taken;
+ __ pop_ptr(r1);
+ __ cmp(r1, r0);
+ __ br(j_not(cc), not_taken);
+ branch(false, false);
+ __ bind(not_taken);
+ __ profile_not_taken_branch(r0);
+}
+
+void TemplateTable::ret() {
+ transition(vtos, vtos);
+ // We might be moving to a safepoint. The thread which calls
+ // Interpreter::notice_safepoints() will effectively flush its cache
+ // when it makes a system call, but we need to do something to
+ // ensure that we see the changed dispatch table.
+ __ membar(MacroAssembler::LoadLoad);
+
+ locals_index(r1);
+ __ ldr(r1, aaddress(r1)); // get return bci, compute return bcp
+ __ profile_ret(r1, r2);
+ __ ldr(rbcp, Address(rmethod, Method::const_offset()));
+ __ lea(rbcp, Address(rbcp, r1));
+ __ add(rbcp, rbcp, in_bytes(ConstMethod::codes_offset()));
+ __ dispatch_next(vtos);
+}
+
+void TemplateTable::wide_ret() {
+ transition(vtos, vtos);
+ locals_index_wide(r1);
+ __ ldr(r1, aaddress(r1)); // get return bci, compute return bcp
+ __ profile_ret(r1, r2);
+ __ ldr(rbcp, Address(rmethod, Method::const_offset()));
+ __ lea(rbcp, Address(rbcp, r1));
+ __ add(rbcp, rbcp, in_bytes(ConstMethod::codes_offset()));
+ __ dispatch_next(vtos);
+}
+
+
+void TemplateTable::tableswitch() {
+ Label default_case, continue_execution;
+ transition(itos, vtos);
+ // align rbcp
+ __ lea(r1, at_bcp(BytesPerInt));
+ __ andr(r1, r1, -BytesPerInt);
+ // load lo & hi
+ __ ldrw(r2, Address(r1, BytesPerInt));
+ __ ldrw(r3, Address(r1, 2 * BytesPerInt));
+ __ rev32(r2, r2);
+ __ rev32(r3, r3);
+ // check against lo & hi
+ __ cmpw(r0, r2);
+ __ br(Assembler::LT, default_case);
+ __ cmpw(r0, r3);
+ __ br(Assembler::GT, default_case);
+ // lookup dispatch offset
+ __ subw(r0, r0, r2);
+ __ lea(r3, Address(r1, r0, Address::uxtw(2)));
+ __ ldrw(r3, Address(r3, 3 * BytesPerInt));
+ __ profile_switch_case(r0, r1, r2);
+ // continue execution
+ __ bind(continue_execution);
+ __ rev32(r3, r3);
+ __ load_unsigned_byte(rscratch1, Address(rbcp, r3, Address::sxtw(0)));
+ __ add(rbcp, rbcp, r3, ext::sxtw);
+ __ dispatch_only(vtos);
+ // handle default
+ __ bind(default_case);
+ __ profile_switch_default(r0);
+ __ ldrw(r3, Address(r1, 0));
+ __ b(continue_execution);
+}
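
After aligning rbcp to the 4-byte jump table, tableswitch byte-swaps the lo/hi bounds, range-checks the key and indexes the table of relative offsets; out-of-range keys fall back to the default offset stored in the first word of the aligned table. The selection reduces to (sketch, all values in native byte order):

    #include <cstdint>
    // tableswitch dispatch-offset selection.
    int32_t tableswitch_offset(int32_t key, int32_t lo, int32_t hi,
                               const int32_t* offsets, int32_t default_offset) {
      if (key < lo || key > hi) return default_offset;
      return offsets[key - lo];
    }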
+
+void TemplateTable::lookupswitch() {
+ transition(itos, itos);
+ __ stop("lookupswitch bytecode should have been rewritten");
+}
+
+void TemplateTable::fast_linearswitch() {
+ transition(itos, vtos);
+ Label loop_entry, loop, found, continue_execution;
+ // bswap r0 so we can avoid bswapping the table entries
+ __ rev32(r0, r0);
+ // align rbcp
+ __ lea(r19, at_bcp(BytesPerInt)); // btw: should be able to get rid of
+ // this instruction (change offsets
+ // below)
+ __ andr(r19, r19, -BytesPerInt);
+ // set counter
+ __ ldrw(r1, Address(r19, BytesPerInt));
+ __ rev32(r1, r1);
+ __ b(loop_entry);
+ // table search
+ __ bind(loop);
+ __ lea(rscratch1, Address(r19, r1, Address::lsl(3)));
+ __ ldrw(rscratch1, Address(rscratch1, 2 * BytesPerInt));
+ __ cmpw(r0, rscratch1);
+ __ br(Assembler::EQ, found);
+ __ bind(loop_entry);
+ __ subs(r1, r1, 1);
+ __ br(Assembler::PL, loop);
+ // default case
+ __ profile_switch_default(r0);
+ __ ldrw(r3, Address(r19, 0));
+ __ b(continue_execution);
+ // entry found -> get offset
+ __ bind(found);
+ __ lea(rscratch1, Address(r19, r1, Address::lsl(3)));
+ __ ldrw(r3, Address(rscratch1, 3 * BytesPerInt));
+ __ profile_switch_case(r1, r0, r19);
+ // continue execution
+ __ bind(continue_execution);
+ __ rev32(r3, r3);
+ __ add(rbcp, rbcp, r3, ext::sxtw);
+ __ ldrb(rscratch1, Address(rbcp, 0));
+ __ dispatch_only(vtos);
+}
+
+void TemplateTable::fast_binaryswitch() {
+ transition(itos, vtos);
+ // Implementation using the following core algorithm:
+ //
+ // int binary_search(int key, LookupswitchPair* array, int n) {
+ // // Binary search according to "Methodik des Programmierens" by
+ // // Edsger W. Dijkstra and W.H.J. Feijen, Addison Wesley Germany 1985.
+ // int i = 0;
+ // int j = n;
+ // while (i+1 < j) {
+ // // invariant P: 0 <= i < j <= n and (a[i] <= key < a[j] or Q)
+ // // with Q: for all i: 0 <= i < n: key < a[i]
+ // // where a stands for the array and assuming that the (nonexistent)
+ // // element a[n] is infinitely big.
+ // int h = (i + j) >> 1;
+ // // i < h < j
+ // if (key < array[h].fast_match()) {
+ // j = h;
+ // } else {
+ // i = h;
+ // }
+ // }
+ // // R: a[i] <= key < a[i+1] or Q
+ // // (i.e., if key is within array, i is the correct index)
+ // return i;
+ // }
+
+ // Register allocation
+ const Register key = r0; // already set (tosca)
+ const Register array = r1;
+ const Register i = r2;
+ const Register j = r3;
+ const Register h = rscratch1;
+ const Register temp = rscratch2;
+
+ // Find array start
+ __ lea(array, at_bcp(3 * BytesPerInt)); // btw: should be able to
+ // get rid of this
+ // instruction (change
+ // offsets below)
+ __ andr(array, array, -BytesPerInt);
+
+ // Initialize i & j
+ __ mov(i, 0); // i = 0;
+ __ ldrw(j, Address(array, -BytesPerInt)); // j = length(array);
+
+ // Convert j into native byteordering
+ __ rev32(j, j);
+
+ // And start
+ Label entry;
+ __ b(entry);
+
+ // binary search loop
+ {
+ Label loop;
+ __ bind(loop);
+ // int h = (i + j) >> 1;
+ __ addw(h, i, j); // h = i + j;
+ __ lsrw(h, h, 1); // h = (i + j) >> 1;
+ // if (key < array[h].fast_match()) {
+ // j = h;
+ // } else {
+ // i = h;
+ // }
+ // Convert array[h].match to native byte-ordering before compare
+ __ ldr(temp, Address(array, h, Address::lsl(3)));
+ __ rev32(temp, temp);
+ __ cmpw(key, temp);
+ // j = h if (key < array[h].fast_match())
+ __ csel(j, h, j, Assembler::LT);
+ // i = h if (key >= array[h].fast_match())
+ __ csel(i, h, i, Assembler::GE);
+ // while (i+1 < j)
+ __ bind(entry);
+ __ addw(h, i, 1); // i+1
+ __ cmpw(h, j); // i+1 < j
+ __ br(Assembler::LT, loop);
+ }
+
+ // end of binary search, result index is i (must check again!)
+ Label default_case;
+ // Convert array[i].match to native byte-ordering before compare
+ __ ldr(temp, Address(array, i, Address::lsl(3)));
+ __ rev32(temp, temp);
+ __ cmpw(key, temp);
+ __ br(Assembler::NE, default_case);
+
+ // entry found -> j = offset
+ __ add(j, array, i, ext::uxtx, 3);
+ __ ldrw(j, Address(j, BytesPerInt));
+ __ profile_switch_case(i, key, array);
+ __ rev32(j, j);
+ __ load_unsigned_byte(rscratch1, Address(rbcp, j, Address::sxtw(0)));
+ __ lea(rbcp, Address(rbcp, j, Address::sxtw(0)));
+ __ dispatch_only(vtos);
+
+ // default case -> j = default offset
+ __ bind(default_case);
+ __ profile_switch_default(i);
+ __ ldrw(j, Address(array, -2 * BytesPerInt));
+ __ rev32(j, j);
+ __ load_unsigned_byte(rscratch1, Address(rbcp, j, Address::sxtw(0)));
+ __ lea(rbcp, Address(rbcp, j, Address::sxtw(0)));
+ __ dispatch_only(vtos);
+}
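
The search in the comment above, written out as compilable C++ over the flattened lookupswitch table (each entry a (match, offset) pair of 32-bit values, already in native byte order; names are illustrative):

    #include <cstdint>
    // Returns the candidate index i; the caller still checks pairs[2*i] == key
    // before using pairs[2*i + 1], exactly as the template re-compares above.
    int binary_search(int32_t key, const int32_t* pairs, int n) {
      int i = 0;
      int j = n;
      while (i + 1 < j) {
        int h = (i + j) >> 1;                  // i < h < j
        if (key < pairs[2 * h]) j = h; else i = h;
      }
      return i;
    }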
+
+
+void TemplateTable::_return(TosState state)
+{
+ transition(state, state);
+ assert(_desc->calls_vm(),
+ "inconsistent calls_vm information"); // call in remove_activation
+
+ if (_desc->bytecode() == Bytecodes::_return_register_finalizer) {
+ assert(state == vtos, "only valid state");
+
+ __ ldr(c_rarg1, aaddress(0));
+ __ load_klass(r3, c_rarg1);
+ __ ldrw(r3, Address(r3, Klass::access_flags_offset()));
+ __ tst(r3, JVM_ACC_HAS_FINALIZER);
+ Label skip_register_finalizer;
+ __ br(Assembler::EQ, skip_register_finalizer);
+
+ __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::register_finalizer), c_rarg1);
+
+ __ bind(skip_register_finalizer);
+ }
+
+ // Issue a StoreStore barrier after all stores but before return
+ // from any constructor for any class with a final field. We don't
+ // know if this is a finalizer, so we always do so.
+ if (_desc->bytecode() == Bytecodes::_return)
+ __ membar(MacroAssembler::StoreStore);
+
+ __ remove_activation(state);
+ __ ret(lr);
+}
+
+// ----------------------------------------------------------------------------
+// Volatile variables demand their effects be made known to all CPU's
+// in order. Store buffers on most chips allow reads & writes to
+// reorder; the JMM's ReadAfterWrite.java test fails in -Xint mode
+// without some kind of memory barrier (i.e., it's not sufficient that
+// the interpreter does not reorder volatile references, the hardware
+// also must not reorder them).
+//
+// According to the new Java Memory Model (JMM):
+// (1) All volatiles are serialized with respect to each other. ALSO reads &
+// writes act as acquire & release, so:
+// (2) A read cannot let unrelated NON-volatile memory refs that
+// happen after the read float up to before the read. It's OK for
+// non-volatile memory refs that happen before the volatile read to
+// float down below it.
+// (3) Similarly, a volatile write cannot let unrelated NON-volatile
+// memory refs that happen BEFORE the write float down to after the
+// write. It's OK for non-volatile memory refs that happen after the
+// volatile write to float up before it.
+//
+// We only put in barriers around volatile refs (they are expensive),
+// not _between_ memory refs (that would require us to track the
+// flavor of the previous memory refs). Requirements (2) and (3)
+// require some barriers before volatile stores and after volatile
+// loads. These nearly cover requirement (1) but miss the
+// volatile-store-volatile-load case. This final case is placed after
+// volatile-stores although it could just as well go before
+// volatile-loads.
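
In C++11 terms, a Java volatile access corresponds roughly to a sequentially consistent atomic, which is the guarantee the barrier placement described above provides (an analogy only; the templates emit explicit membar instructions rather than using any library):

    #include <atomic>
    std::atomic<int> v;
    int  volatile_read()       { return v.load(std::memory_order_seq_cst); }
    void volatile_write(int x) { v.store(x, std::memory_order_seq_cst); }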
+
+void TemplateTable::resolve_cache_and_index(int byte_no,
+ Register Rcache,
+ Register index,
+ size_t index_size) {
+ const Register temp = r19;
+ assert_different_registers(Rcache, index, temp);
+
+ Label resolved;
+ assert(byte_no == f1_byte || byte_no == f2_byte, "byte_no out of range");
+ __ get_cache_and_index_and_bytecode_at_bcp(Rcache, index, temp, byte_no, 1, index_size);
+ __ cmp(temp, (int) bytecode()); // have we resolved this bytecode?
+ __ br(Assembler::EQ, resolved);
+
+ // resolve first time through
+ address entry;
+ switch (bytecode()) {
+ case Bytecodes::_getstatic:
+ case Bytecodes::_putstatic:
+ case Bytecodes::_getfield:
+ case Bytecodes::_putfield:
+ entry = CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_get_put);
+ break;
+ case Bytecodes::_invokevirtual:
+ case Bytecodes::_invokespecial:
+ case Bytecodes::_invokestatic:
+ case Bytecodes::_invokeinterface:
+ entry = CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_invoke);
+ break;
+ case Bytecodes::_invokehandle:
+ entry = CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_invokehandle);
+ break;
+ case Bytecodes::_invokedynamic:
+ entry = CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_invokedynamic);
+ break;
+ default:
+ fatal(err_msg("unexpected bytecode: %s", Bytecodes::name(bytecode())));
+ break;
+ }
+ __ mov(temp, (int) bytecode());
+ __ call_VM(noreg, entry, temp);
+
+ // Update registers with resolved info
+ __ get_cache_and_index_at_bcp(Rcache, index, 1, index_size);
+ // n.b. unlike x86 Rcache is now rcpool plus the indexed offset
+ // so all clients of this method must be modified accordingly
+ __ bind(resolved);
+}
+
+// The Rcache and index registers must be set before call
+// n.b. unlike x86, cache already includes the index offset
+void TemplateTable::load_field_cp_cache_entry(Register obj,
+ Register cache,
+ Register index,
+ Register off,
+ Register flags,
+ bool is_static = false) {
+ assert_different_registers(cache, index, flags, off);
+
+ ByteSize cp_base_offset = ConstantPoolCache::base_offset();
+ // Field offset
+ __ ldr(off, Address(cache, in_bytes(cp_base_offset +
+ ConstantPoolCacheEntry::f2_offset())));
+ // Flags
+ __ ldrw(flags, Address(cache, in_bytes(cp_base_offset +
+ ConstantPoolCacheEntry::flags_offset())));
+
+ // klass overwrite register
+ if (is_static) {
+ __ ldr(obj, Address(cache, in_bytes(cp_base_offset +
+ ConstantPoolCacheEntry::f1_offset())));
+ const int mirror_offset = in_bytes(Klass::java_mirror_offset());
+ __ ldr(obj, Address(obj, mirror_offset));
+ }
+}
+
+void TemplateTable::load_invoke_cp_cache_entry(int byte_no,
+ Register method,
+ Register itable_index,
+ Register flags,
+ bool is_invokevirtual,
+ bool is_invokevfinal, /*unused*/
+ bool is_invokedynamic) {
+ // setup registers
+ const Register cache = rscratch2;
+ const Register index = r4;
+ assert_different_registers(method, flags);
+ assert_different_registers(method, cache, index);
+ assert_different_registers(itable_index, flags);
+ assert_different_registers(itable_index, cache, index);
+ // determine constant pool cache field offsets
+ assert(is_invokevirtual == (byte_no == f2_byte), "is_invokevirtual flag redundant");
+ const int method_offset = in_bytes(
+ ConstantPoolCache::base_offset() +
+ (is_invokevirtual
+ ? ConstantPoolCacheEntry::f2_offset()
+ : ConstantPoolCacheEntry::f1_offset()));
+ const int flags_offset = in_bytes(ConstantPoolCache::base_offset() +
+ ConstantPoolCacheEntry::flags_offset());
+ // access constant pool cache fields
+ const int index_offset = in_bytes(ConstantPoolCache::base_offset() +
+ ConstantPoolCacheEntry::f2_offset());
+
+ size_t index_size = (is_invokedynamic ? sizeof(u4) : sizeof(u2));
+ resolve_cache_and_index(byte_no, cache, index, index_size);
+ __ ldr(method, Address(cache, method_offset));
+
+ if (itable_index != noreg) {
+ __ ldr(itable_index, Address(cache, index_offset));
+ }
+ __ ldrw(flags, Address(cache, flags_offset));
+}
+
+
+// The registers cache and index are expected to be set before the call.
+// Correct values of the cache and index registers are preserved.
+void TemplateTable::jvmti_post_field_access(Register cache, Register index,
+ bool is_static, bool has_tos) {
+ // do the JVMTI work here to avoid disturbing the register state below
+ // We use c_rarg registers here because we want to use the register used in
+ // the call to the VM
+ if (JvmtiExport::can_post_field_access()) {
+ // Check to see if a field access watch has been set before we
+ // take the time to call into the VM.
+ Label L1;
+ assert_different_registers(cache, index, r0);
+ __ lea(rscratch1, ExternalAddress((address) JvmtiExport::get_field_access_count_addr()));
+ __ ldrw(r0, Address(rscratch1));
+ __ cbzw(r0, L1);
+
+ __ get_cache_and_index_at_bcp(c_rarg2, c_rarg3, 1);
+ __ lea(c_rarg2, Address(c_rarg2, in_bytes(ConstantPoolCache::base_offset())));
+
+ if (is_static) {
+ __ mov(c_rarg1, zr); // NULL object reference
+ } else {
+ __ ldr(c_rarg1, at_tos()); // get object pointer without popping it
+ __ verify_oop(c_rarg1);
+ }
+ // c_rarg1: object pointer or NULL
+ // c_rarg2: cache entry pointer
+ // c_rarg3: jvalue object on the stack
+ __ call_VM(noreg, CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::post_field_access),
+ c_rarg1, c_rarg2, c_rarg3);
+ __ get_cache_and_index_at_bcp(cache, index, 1);
+ __ bind(L1);
+ }
+}
+
+void TemplateTable::pop_and_check_object(Register r)
+{
+ __ pop_ptr(r);
+ __ null_check(r); // for field access must check obj.
+ __ verify_oop(r);
+}
+
+void TemplateTable::getfield_or_static(int byte_no, bool is_static)
+{
+ const Register cache = r2;
+ const Register index = r3;
+ const Register obj = r4;
+ const Register off = r19;
+ const Register flags = r0;
+ const Register bc = r4; // uses same reg as obj, so don't mix them
+
+ resolve_cache_and_index(byte_no, cache, index, sizeof(u2));
+ jvmti_post_field_access(cache, index, is_static, false);
+ load_field_cp_cache_entry(obj, cache, index, off, flags, is_static);
+
+ if (!is_static) {
+ // obj is on the stack
+ pop_and_check_object(obj);
+ }
+
+ const Address field(obj, off);
+
+ Label Done, notByte, notInt, notShort, notChar,
+ notLong, notFloat, notObj, notDouble;
+
+  // x86 uses a shift and mask, or a shift plus an assert that the
+  // mask is not needed; aarch64 just uses a bitfield extract
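+  // (roughly: flags = (flags >> tos_state_shift) & ((1 << tos_state_bits) - 1),
+  //  computed here in a single ubfxw)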
+ __ ubfxw(flags, flags, ConstantPoolCacheEntry::tos_state_shift, ConstantPoolCacheEntry::tos_state_bits);
+
+ assert(btos == 0, "change code, btos != 0");
+ __ cbnz(flags, notByte);
+
+ // btos
+ __ load_signed_byte(r0, field);
+ __ push(btos);
+ // Rewrite bytecode to be faster
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_bgetfield, bc, r1);
+ }
+ __ b(Done);
+
+ __ bind(notByte);
+ __ cmp(flags, atos);
+ __ br(Assembler::NE, notObj);
+ // atos
+ __ load_heap_oop(r0, field);
+ __ push(atos);
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_agetfield, bc, r1);
+ }
+ __ b(Done);
+
+ __ bind(notObj);
+ __ cmp(flags, itos);
+ __ br(Assembler::NE, notInt);
+ // itos
+ __ ldrw(r0, field);
+ __ push(itos);
+ // Rewrite bytecode to be faster
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_igetfield, bc, r1);
+ }
+ __ b(Done);
+
+ __ bind(notInt);
+ __ cmp(flags, ctos);
+ __ br(Assembler::NE, notChar);
+ // ctos
+ __ load_unsigned_short(r0, field);
+ __ push(ctos);
+ // Rewrite bytecode to be faster
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_cgetfield, bc, r1);
+ }
+ __ b(Done);
+
+ __ bind(notChar);
+ __ cmp(flags, stos);
+ __ br(Assembler::NE, notShort);
+ // stos
+ __ load_signed_short(r0, field);
+ __ push(stos);
+ // Rewrite bytecode to be faster
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_sgetfield, bc, r1);
+ }
+ __ b(Done);
+
+ __ bind(notShort);
+ __ cmp(flags, ltos);
+ __ br(Assembler::NE, notLong);
+ // ltos
+ __ ldr(r0, field);
+ __ push(ltos);
+ // Rewrite bytecode to be faster
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_lgetfield, bc, r1);
+ }
+ __ b(Done);
+
+ __ bind(notLong);
+ __ cmp(flags, ftos);
+ __ br(Assembler::NE, notFloat);
+ // ftos
+ __ ldrs(v0, field);
+ __ push(ftos);
+ // Rewrite bytecode to be faster
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_fgetfield, bc, r1);
+ }
+ __ b(Done);
+
+ __ bind(notFloat);
+#ifdef ASSERT
+ __ cmp(flags, dtos);
+ __ br(Assembler::NE, notDouble);
+#endif
+ // dtos
+ __ ldrd(v0, field);
+ __ push(dtos);
+ // Rewrite bytecode to be faster
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_dgetfield, bc, r1);
+ }
+#ifdef ASSERT
+ __ b(Done);
+
+ __ bind(notDouble);
+ __ stop("Bad state");
+#endif
+
+ __ bind(Done);
+ // It's really not worth bothering to check whether this field
+ // really is volatile in the slow case.
+ __ membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
+}
+
+
+void TemplateTable::getfield(int byte_no)
+{
+ getfield_or_static(byte_no, false);
+}
+
+void TemplateTable::getstatic(int byte_no)
+{
+ getfield_or_static(byte_no, true);
+}
+
+// The registers cache and index are expected to be set before the call.
+// The function may destroy various registers, just not the cache and index registers.
+void TemplateTable::jvmti_post_field_mod(Register cache, Register index, bool is_static) {
+ transition(vtos, vtos);
+
+ ByteSize cp_base_offset = ConstantPoolCache::base_offset();
+
+ if (JvmtiExport::can_post_field_modification()) {
+ // Check to see if a field modification watch has been set before
+ // we take the time to call into the VM.
+ Label L1;
+ assert_different_registers(cache, index, r0);
+ __ lea(rscratch1, ExternalAddress((address)JvmtiExport::get_field_modification_count_addr()));
+ __ ldrw(r0, Address(rscratch1));
+ __ cbz(r0, L1);
+
+ __ get_cache_and_index_at_bcp(c_rarg2, rscratch1, 1);
+
+ if (is_static) {
+ // Life is simple. Null out the object pointer.
+ __ mov(c_rarg1, zr);
+ } else {
+ // Life is harder. The stack holds the value on top, followed by
+ // the object. We don't know the size of the value, though; it
+ // could be one or two words depending on its type. As a result,
+ // we must find the type to determine where the object is.
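+      // A sketch of the expression stack at this point:
+      //   one-word value:  [tos] = value,          [tos+1] = objref  (at_tos_p1)
+      //   two-word value:  [tos..tos+1] = value,   [tos+2] = objref  (at_tos_p2)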
+ __ ldrw(c_rarg3, Address(c_rarg2,
+ in_bytes(cp_base_offset +
+ ConstantPoolCacheEntry::flags_offset())));
+ __ lsr(c_rarg3, c_rarg3,
+ ConstantPoolCacheEntry::tos_state_shift);
+ ConstantPoolCacheEntry::verify_tos_state_shift();
+ Label nope2, done, ok;
+ __ ldr(c_rarg1, at_tos_p1()); // initially assume a one word jvalue
+ __ cmpw(c_rarg3, ltos);
+ __ br(Assembler::EQ, ok);
+ __ cmpw(c_rarg3, dtos);
+ __ br(Assembler::NE, nope2);
+ __ bind(ok);
+ __ ldr(c_rarg1, at_tos_p2()); // ltos (two word jvalue)
+ __ bind(nope2);
+ }
+ // cache entry pointer
+ __ add(c_rarg2, c_rarg2, in_bytes(cp_base_offset));
+ // object (tos)
+ __ mov(c_rarg3, esp);
+ // c_rarg1: object pointer set up above (NULL if static)
+ // c_rarg2: cache entry pointer
+ // c_rarg3: jvalue object on the stack
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::post_field_modification),
+ c_rarg1, c_rarg2, c_rarg3);
+ __ get_cache_and_index_at_bcp(cache, index, 1);
+ __ bind(L1);
+ }
+}
+
+void TemplateTable::putfield_or_static(int byte_no, bool is_static) {
+ transition(vtos, vtos);
+
+ const Register cache = r2;
+ const Register index = r3;
+ const Register obj = r2;
+ const Register off = r19;
+ const Register flags = r0;
+ const Register bc = r4;
+
+ resolve_cache_and_index(byte_no, cache, index, sizeof(u2));
+ jvmti_post_field_mod(cache, index, is_static);
+ load_field_cp_cache_entry(obj, cache, index, off, flags, is_static);
+
+ Label Done;
+ __ mov(r5, flags);
+
+ {
+ Label notVolatile;
+ __ tbz(r5, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+ __ membar(MacroAssembler::StoreStore);
+ __ bind(notVolatile);
+ }
+
+ // field address
+ const Address field(obj, off);
+
+ Label notByte, notInt, notShort, notChar,
+ notLong, notFloat, notObj, notDouble;
+
+  // x86 uses a shift and mask, or a shift plus an assert that the
+  // mask is not needed; aarch64 just uses a bitfield extract
+ __ ubfxw(flags, flags, ConstantPoolCacheEntry::tos_state_shift, ConstantPoolCacheEntry::tos_state_bits);
+
+ assert(btos == 0, "change code, btos != 0");
+ __ cbnz(flags, notByte);
+
+ // btos
+ {
+ __ pop(btos);
+ if (!is_static) pop_and_check_object(obj);
+ __ strb(r0, field);
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_bputfield, bc, r1, true, byte_no);
+ }
+ __ b(Done);
+ }
+
+ __ bind(notByte);
+ __ cmp(flags, atos);
+ __ br(Assembler::NE, notObj);
+
+ // atos
+ {
+ __ pop(atos);
+ if (!is_static) pop_and_check_object(obj);
+ // Store into the field
+ do_oop_store(_masm, field, r0, _bs->kind(), false);
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_aputfield, bc, r1, true, byte_no);
+ }
+ __ b(Done);
+ }
+
+ __ bind(notObj);
+ __ cmp(flags, itos);
+ __ br(Assembler::NE, notInt);
+
+ // itos
+ {
+ __ pop(itos);
+ if (!is_static) pop_and_check_object(obj);
+ __ strw(r0, field);
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_iputfield, bc, r1, true, byte_no);
+ }
+ __ b(Done);
+ }
+
+ __ bind(notInt);
+ __ cmp(flags, ctos);
+ __ br(Assembler::NE, notChar);
+
+ // ctos
+ {
+ __ pop(ctos);
+ if (!is_static) pop_and_check_object(obj);
+ __ strh(r0, field);
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_cputfield, bc, r1, true, byte_no);
+ }
+ __ b(Done);
+ }
+
+ __ bind(notChar);
+ __ cmp(flags, stos);
+ __ br(Assembler::NE, notShort);
+
+ // stos
+ {
+ __ pop(stos);
+ if (!is_static) pop_and_check_object(obj);
+ __ strh(r0, field);
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_sputfield, bc, r1, true, byte_no);
+ }
+ __ b(Done);
+ }
+
+ __ bind(notShort);
+ __ cmp(flags, ltos);
+ __ br(Assembler::NE, notLong);
+
+ // ltos
+ {
+ __ pop(ltos);
+ if (!is_static) pop_and_check_object(obj);
+ __ str(r0, field);
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_lputfield, bc, r1, true, byte_no);
+ }
+ __ b(Done);
+ }
+
+ __ bind(notLong);
+ __ cmp(flags, ftos);
+ __ br(Assembler::NE, notFloat);
+
+ // ftos
+ {
+ __ pop(ftos);
+ if (!is_static) pop_and_check_object(obj);
+ __ strs(v0, field);
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_fputfield, bc, r1, true, byte_no);
+ }
+ __ b(Done);
+ }
+
+ __ bind(notFloat);
+#ifdef ASSERT
+ __ cmp(flags, dtos);
+ __ br(Assembler::NE, notDouble);
+#endif
+
+ // dtos
+ {
+ __ pop(dtos);
+ if (!is_static) pop_and_check_object(obj);
+ __ strd(v0, field);
+ if (!is_static) {
+ patch_bytecode(Bytecodes::_fast_dputfield, bc, r1, true, byte_no);
+ }
+ }
+
+#ifdef ASSERT
+ __ b(Done);
+
+ __ bind(notDouble);
+ __ stop("Bad state");
+#endif
+
+ __ bind(Done);
+
+ {
+ Label notVolatile;
+ __ tbz(r5, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+ __ membar(MacroAssembler::StoreLoad);
+ __ bind(notVolatile);
+ }
+}
+
+void TemplateTable::putfield(int byte_no)
+{
+ putfield_or_static(byte_no, false);
+}
+
+void TemplateTable::putstatic(int byte_no) {
+ putfield_or_static(byte_no, true);
+}
+
+void TemplateTable::jvmti_post_fast_field_mod()
+{
+ if (JvmtiExport::can_post_field_modification()) {
+ // Check to see if a field modification watch has been set before
+ // we take the time to call into the VM.
+ Label L2;
+ __ lea(rscratch1, ExternalAddress((address)JvmtiExport::get_field_modification_count_addr()));
+ __ ldrw(c_rarg3, Address(rscratch1));
+ __ cbzw(c_rarg3, L2);
+ __ pop_ptr(r19); // copy the object pointer from tos
+ __ verify_oop(r19);
+ __ push_ptr(r19); // put the object pointer back on tos
+ // Save tos values before call_VM() clobbers them. Since we have
+ // to do it for every data type, we use the saved values as the
+ // jvalue object.
+ switch (bytecode()) { // load values into the jvalue object
+ case Bytecodes::_fast_aputfield: __ push_ptr(r0); break;
+ case Bytecodes::_fast_bputfield: // fall through
+ case Bytecodes::_fast_sputfield: // fall through
+ case Bytecodes::_fast_cputfield: // fall through
+ case Bytecodes::_fast_iputfield: __ push_i(r0); break;
+ case Bytecodes::_fast_dputfield: __ push_d(); break;
+ case Bytecodes::_fast_fputfield: __ push_f(); break;
+ case Bytecodes::_fast_lputfield: __ push_l(r0); break;
+
+ default:
+ ShouldNotReachHere();
+ }
+ __ mov(c_rarg3, esp); // points to jvalue on the stack
+ // access constant pool cache entry
+ __ get_cache_entry_pointer_at_bcp(c_rarg2, r0, 1);
+ __ verify_oop(r19);
+ // r19: object pointer copied above
+ // c_rarg2: cache entry pointer
+ // c_rarg3: jvalue object on the stack
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::post_field_modification),
+ r19, c_rarg2, c_rarg3);
+
+ switch (bytecode()) { // restore tos values
+ case Bytecodes::_fast_aputfield: __ pop_ptr(r0); break;
+ case Bytecodes::_fast_bputfield: // fall through
+ case Bytecodes::_fast_sputfield: // fall through
+ case Bytecodes::_fast_cputfield: // fall through
+ case Bytecodes::_fast_iputfield: __ pop_i(r0); break;
+ case Bytecodes::_fast_dputfield: __ pop_d(); break;
+ case Bytecodes::_fast_fputfield: __ pop_f(); break;
+ case Bytecodes::_fast_lputfield: __ pop_l(r0); break;
+ }
+ __ bind(L2);
+ }
+}
+
+void TemplateTable::fast_storefield(TosState state)
+{
+ transition(state, vtos);
+
+ ByteSize base = ConstantPoolCache::base_offset();
+
+ jvmti_post_fast_field_mod();
+
+ // access constant pool cache
+ __ get_cache_and_index_at_bcp(r2, r1, 1);
+
+ // test for volatile with r3
+ __ ldrw(r3, Address(r2, in_bytes(base +
+ ConstantPoolCacheEntry::flags_offset())));
+
+ // replace index with field offset from cache entry
+ __ ldr(r1, Address(r2, in_bytes(base + ConstantPoolCacheEntry::f2_offset())));
+
+ {
+ Label notVolatile;
+ __ tbz(r3, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+ __ membar(MacroAssembler::StoreStore);
+ __ bind(notVolatile);
+ }
+
+ Label notVolatile;
+
+ // Get object from stack
+ pop_and_check_object(r2);
+
+ // field address
+ const Address field(r2, r1);
+
+ // access field
+ switch (bytecode()) {
+ case Bytecodes::_fast_aputfield:
+ do_oop_store(_masm, field, r0, _bs->kind(), false);
+ break;
+ case Bytecodes::_fast_lputfield:
+ __ str(r0, field);
+ break;
+ case Bytecodes::_fast_iputfield:
+ __ strw(r0, field);
+ break;
+ case Bytecodes::_fast_bputfield:
+ __ strb(r0, field);
+ break;
+ case Bytecodes::_fast_sputfield:
+ // fall through
+ case Bytecodes::_fast_cputfield:
+ __ strh(r0, field);
+ break;
+ case Bytecodes::_fast_fputfield:
+ __ strs(v0, field);
+ break;
+ case Bytecodes::_fast_dputfield:
+ __ strd(v0, field);
+ break;
+ default:
+ ShouldNotReachHere();
+ }
+
+ {
+ Label notVolatile;
+ __ tbz(r3, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+ __ membar(MacroAssembler::StoreLoad);
+ __ bind(notVolatile);
+ }
+}
+
+
+void TemplateTable::fast_accessfield(TosState state)
+{
+ transition(atos, state);
+ // Do the JVMTI work here to avoid disturbing the register state below
+ if (JvmtiExport::can_post_field_access()) {
+ // Check to see if a field access watch has been set before we
+ // take the time to call into the VM.
+ Label L1;
+ __ lea(rscratch1, ExternalAddress((address) JvmtiExport::get_field_access_count_addr()));
+ __ ldrw(r2, Address(rscratch1));
+ __ cbzw(r2, L1);
+ // access constant pool cache entry
+ __ get_cache_entry_pointer_at_bcp(c_rarg2, rscratch2, 1);
+ __ verify_oop(r0);
+ __ push_ptr(r0); // save object pointer before call_VM() clobbers it
+ __ mov(c_rarg1, r0);
+ // c_rarg1: object pointer copied above
+ // c_rarg2: cache entry pointer
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::post_field_access),
+ c_rarg1, c_rarg2);
+ __ pop_ptr(r0); // restore object pointer
+ __ bind(L1);
+ }
+
+ // access constant pool cache
+ __ get_cache_and_index_at_bcp(r2, r1, 1);
+ __ ldr(r1, Address(r2, in_bytes(ConstantPoolCache::base_offset() +
+ ConstantPoolCacheEntry::f2_offset())));
+ __ ldrw(r3, Address(r2, in_bytes(ConstantPoolCache::base_offset() +
+ ConstantPoolCacheEntry::flags_offset())));
+
+ // r0: object
+ __ verify_oop(r0);
+ __ null_check(r0);
+ const Address field(r0, r1);
+
+ // access field
+ switch (bytecode()) {
+ case Bytecodes::_fast_agetfield:
+ __ load_heap_oop(r0, field);
+ __ verify_oop(r0);
+ break;
+ case Bytecodes::_fast_lgetfield:
+ __ ldr(r0, field);
+ break;
+ case Bytecodes::_fast_igetfield:
+ __ ldrw(r0, field);
+ break;
+ case Bytecodes::_fast_bgetfield:
+ __ load_signed_byte(r0, field);
+ break;
+ case Bytecodes::_fast_sgetfield:
+ __ load_signed_short(r0, field);
+ break;
+ case Bytecodes::_fast_cgetfield:
+ __ load_unsigned_short(r0, field);
+ break;
+ case Bytecodes::_fast_fgetfield:
+ __ ldrs(v0, field);
+ break;
+ case Bytecodes::_fast_dgetfield:
+ __ ldrd(v0, field);
+ break;
+ default:
+ ShouldNotReachHere();
+ }
+ {
+ Label notVolatile;
+ __ tbz(r3, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+ __ membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore);
+ __ bind(notVolatile);
+ }
+}
+
+void TemplateTable::fast_xaccess(TosState state)
+{
+ transition(vtos, state);
+
+ // get receiver
+ __ ldr(r0, aaddress(0));
+ // access constant pool cache
+ __ get_cache_and_index_at_bcp(r2, r3, 2);
+ __ ldr(r1, Address(r2, in_bytes(ConstantPoolCache::base_offset() +
+ ConstantPoolCacheEntry::f2_offset())));
+ // make sure exception is reported in correct bcp range (getfield is
+ // next instruction)
+ __ increment(rbcp);
+ __ null_check(r0);
+ switch (state) {
+ case itos:
+ __ ldr(r0, Address(r0, r1, Address::lsl(0)));
+ break;
+ case atos:
+ __ load_heap_oop(r0, Address(r0, r1, Address::lsl(0)));
+ __ verify_oop(r0);
+ break;
+ case ftos:
+ __ ldrs(v0, Address(r0, r1, Address::lsl(0)));
+ break;
+ default:
+ ShouldNotReachHere();
+ }
+
+ {
+ Label notVolatile;
+ __ ldrw(r3, Address(r2, in_bytes(ConstantPoolCache::base_offset() +
+ ConstantPoolCacheEntry::flags_offset())));
+ __ tbz(r3, ConstantPoolCacheEntry::is_volatile_shift, notVolatile);
+ __ membar(MacroAssembler::LoadLoad);
+ __ bind(notVolatile);
+ }
+
+ __ decrement(rbcp);
+}
+
+
+
+//-----------------------------------------------------------------------------
+// Calls
+
+void TemplateTable::count_calls(Register method, Register temp)
+{
+ __ call_Unimplemented();
+}
+
+void TemplateTable::prepare_invoke(int byte_no,
+ Register method, // linked method (or i-klass)
+ Register index, // itable index, MethodType, etc.
+ Register recv, // if caller wants to see it
+ Register flags // if caller wants to test it
+ ) {
+ // determine flags
+ Bytecodes::Code code = bytecode();
+ const bool is_invokeinterface = code == Bytecodes::_invokeinterface;
+ const bool is_invokedynamic = code == Bytecodes::_invokedynamic;
+ const bool is_invokehandle = code == Bytecodes::_invokehandle;
+ const bool is_invokevirtual = code == Bytecodes::_invokevirtual;
+ const bool is_invokespecial = code == Bytecodes::_invokespecial;
+ const bool load_receiver = (recv != noreg);
+ const bool save_flags = (flags != noreg);
+ assert(load_receiver == (code != Bytecodes::_invokestatic && code != Bytecodes::_invokedynamic), "");
+ assert(save_flags == (is_invokeinterface || is_invokevirtual), "need flags for vfinal");
+ assert(flags == noreg || flags == r3, "");
+ assert(recv == noreg || recv == r2, "");
+
+ // setup registers & access constant pool cache
+ if (recv == noreg) recv = r2;
+ if (flags == noreg) flags = r3;
+ assert_different_registers(method, index, recv, flags);
+
+ // save 'interpreter return address'
+ __ save_bcp();
+
+ load_invoke_cp_cache_entry(byte_no, method, index, flags, is_invokevirtual, false, is_invokedynamic);
+
+ // maybe push appendix to arguments (just before return address)
+ if (is_invokedynamic || is_invokehandle) {
+ Label L_no_push;
+ __ tbz(flags, ConstantPoolCacheEntry::has_appendix_shift, L_no_push);
+ // Push the appendix as a trailing parameter.
+ // This must be done before we get the receiver,
+ // since the parameter_size includes it.
+ __ push(r19);
+ __ mov(r19, index);
+ assert(ConstantPoolCacheEntry::_indy_resolved_references_appendix_offset == 0, "appendix expected at index+0");
+ __ load_resolved_reference_at_index(index, r19);
+ __ pop(r19);
+ __ push(index); // push appendix (MethodType, CallSite, etc.)
+ __ bind(L_no_push);
+ }
+
+ // load receiver if needed (note: no return address pushed yet)
+ if (load_receiver) {
+ __ andw(recv, flags, ConstantPoolCacheEntry::parameter_size_mask);
+ // FIXME -- is this actually correct? looks like it should be 2
+ // const int no_return_pc_pushed_yet = -1; // argument slot correction before we push return address
+ // const int receiver_is_at_end = -1; // back off one slot to get receiver
+ // Address recv_addr = __ argument_address(recv, no_return_pc_pushed_yet + receiver_is_at_end);
+ // __ movptr(recv, recv_addr);
+ __ add(rscratch1, esp, recv, ext::uxtx, 3); // FIXME: uxtb here?
+ __ ldr(recv, Address(rscratch1, -Interpreter::expr_offset_in_bytes(1)));
+ __ verify_oop(recv);
+ }
+
+ // compute return type
+  // x86 uses a shift and mask, or a shift plus an assert that the
+  // mask is not needed; aarch64 just uses a bitfield extract
+ __ ubfxw(rscratch2, flags, ConstantPoolCacheEntry::tos_state_shift, ConstantPoolCacheEntry::tos_state_bits);
+ // load return address
+ {
+ const address table_addr = (address) Interpreter::invoke_return_entry_table_for(code);
+ __ mov(rscratch1, table_addr);
+ __ ldr(lr, Address(rscratch1, rscratch2, Address::lsl(3)));
+ }
+}
+
+
+void TemplateTable::invokevirtual_helper(Register index,
+ Register recv,
+ Register flags)
+{
+ // Uses temporary registers r0, r3
+ assert_different_registers(index, recv, r0, r3);
+ // Test for an invoke of a final method
+ Label notFinal;
+ __ tbz(flags, ConstantPoolCacheEntry::is_vfinal_shift, notFinal);
+
+ const Register method = index; // method must be rmethod
+ assert(method == rmethod,
+ "methodOop must be rmethod for interpreter calling convention");
+
+ // do the call - the index is actually the method to call
+ // that is, f2 is a vtable index if !is_vfinal, else f2 is a Method*
+
+ // It's final, need a null check here!
+ __ null_check(recv);
+
+ // profile this call
+ __ profile_final_call(r0);
+ __ profile_arguments_type(r0, method, r4, true);
+
+ __ jump_from_interpreted(method, r0);
+
+ __ bind(notFinal);
+
+ // get receiver klass
+ __ null_check(recv, oopDesc::klass_offset_in_bytes());
+ __ load_klass(r0, recv);
+
+ // profile this call
+ __ profile_virtual_call(r0, rlocals, r3);
+
+ // get target methodOop & entry point
+ __ lookup_virtual_method(r0, index, method);
+ __ profile_arguments_type(r3, method, r4, true);
+ // FIXME -- this looks completely redundant. is it?
+ // __ ldr(r3, Address(method, Method::interpreter_entry_offset()));
+ __ jump_from_interpreted(method, r3);
+}
+
+void TemplateTable::invokevirtual(int byte_no)
+{
+ transition(vtos, vtos);
+ assert(byte_no == f2_byte, "use this argument");
+
+ prepare_invoke(byte_no, rmethod, noreg, r2, r3);
+
+ // rmethod: index (actually a Method*)
+ // r2: receiver
+ // r3: flags
+
+ invokevirtual_helper(rmethod, r2, r3);
+}
+
+void TemplateTable::invokespecial(int byte_no)
+{
+ transition(vtos, vtos);
+ assert(byte_no == f1_byte, "use this argument");
+
+ prepare_invoke(byte_no, rmethod, noreg, // get f1 Method*
+ r2); // get receiver also for null check
+ __ verify_oop(r2);
+ __ null_check(r2);
+ // do the call
+ __ profile_call(r0);
+ __ profile_arguments_type(r0, rmethod, rbcp, false);
+ __ jump_from_interpreted(rmethod, r0);
+}
+
+void TemplateTable::invokestatic(int byte_no)
+{
+ transition(vtos, vtos);
+ assert(byte_no == f1_byte, "use this argument");
+
+ prepare_invoke(byte_no, rmethod); // get f1 Method*
+ // do the call
+ __ profile_call(r0);
+ __ profile_arguments_type(r0, rmethod, r4, false);
+ __ jump_from_interpreted(rmethod, r0);
+}
+
+void TemplateTable::fast_invokevfinal(int byte_no)
+{
+ __ call_Unimplemented();
+}
+
+void TemplateTable::invokeinterface(int byte_no) {
+ transition(vtos, vtos);
+ assert(byte_no == f1_byte, "use this argument");
+
+ prepare_invoke(byte_no, r0, rmethod, // get f1 Klass*, f2 itable index
+ r2, r3); // recv, flags
+
+ // r0: interface klass (from f1)
+ // rmethod: itable index (from f2)
+ // r2: receiver
+ // r3: flags
+
+ // Special case of invokeinterface called for virtual method of
+ // java.lang.Object. See cpCacheOop.cpp for details.
+ // This code isn't produced by javac, but could be produced by
+ // another compliant java compiler.
+ Label notMethod;
+ __ tbz(r3, ConstantPoolCacheEntry::is_forced_virtual_shift, notMethod);
+
+ invokevirtual_helper(rmethod, r2, r3);
+ __ bind(notMethod);
+
+ // Get receiver klass into r3 - also a null check
+ __ restore_locals();
+ __ null_check(r2, oopDesc::klass_offset_in_bytes());
+ __ load_klass(r3, r2);
+
+ // profile this call
+ __ profile_virtual_call(r3, r13, r19);
+
+ Label no_such_interface, no_such_method;
+
+ __ lookup_interface_method(// inputs: rec. class, interface, itable index
+ r3, r0, rmethod,
+ // outputs: method, scan temp. reg
+ rmethod, r13,
+ no_such_interface);
+
+  // rmethod: methodOop to call
+ // r2: receiver
+ // Check for abstract method error
+ // Note: This should be done more efficiently via a throw_abstract_method_error
+ // interpreter entry point and a conditional jump to it in case of a null
+ // method.
+ __ cbz(rmethod, no_such_method);
+
+ __ profile_arguments_type(r3, rmethod, r13, true);
+
+ // do the call
+ // r2: receiver
+  // rmethod: methodOop
+ __ jump_from_interpreted(rmethod, r3);
+ __ should_not_reach_here();
+
+ // exception handling code follows...
+ // note: must restore interpreter registers to canonical
+ // state for exception handling to work correctly!
+
+ __ bind(no_such_method);
+ // throw exception
+ __ restore_bcp(); // bcp must be correct for exception handler (was destroyed)
+ __ restore_locals(); // make sure locals pointer is correct as well (was destroyed)
+ __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_AbstractMethodError));
+ // the call_VM checks for exception, so we should never return here.
+ __ should_not_reach_here();
+
+ __ bind(no_such_interface);
+ // throw exception
+ __ restore_bcp(); // bcp must be correct for exception handler (was destroyed)
+ __ restore_locals(); // make sure locals pointer is correct as well (was destroyed)
+ __ call_VM(noreg, CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::throw_IncompatibleClassChangeError));
+ // the call_VM checks for exception, so we should never return here.
+ __ should_not_reach_here();
+ return;
+}
+
+void TemplateTable::invokehandle(int byte_no) {
+ transition(vtos, vtos);
+ assert(byte_no == f1_byte, "use this argument");
+
+ prepare_invoke(byte_no, rmethod, r0, r2);
+ __ verify_method_ptr(r2);
+ __ verify_oop(r2);
+ __ null_check(r2);
+
+ // FIXME: profile the LambdaForm also
+
+ // r13 is safe to use here as a scratch reg because it is about to
+ // be clobbered by jump_from_interpreted().
+ __ profile_final_call(r13);
+ __ profile_arguments_type(r13, rmethod, r4, true);
+
+ __ jump_from_interpreted(rmethod, r0);
+}
+
+void TemplateTable::invokedynamic(int byte_no) {
+ transition(vtos, vtos);
+ assert(byte_no == f1_byte, "use this argument");
+
+ prepare_invoke(byte_no, rmethod, r0);
+
+ // r0: CallSite object (from cpool->resolved_references[])
+ // rmethod: MH.linkToCallSite method (from f2)
+
+ // Note: r0_callsite is already pushed by prepare_invoke
+
+ // %%% should make a type profile for any invokedynamic that takes a ref argument
+ // profile this call
+ __ profile_call(rbcp);
+ __ profile_arguments_type(r3, rmethod, r13, false);
+
+ __ verify_oop(r0);
+
+ __ jump_from_interpreted(rmethod, r0);
+}
+
+
+//-----------------------------------------------------------------------------
+// Allocation
+
+void TemplateTable::_new() {
+ transition(vtos, atos);
+
+ __ get_unsigned_2_byte_index_at_bcp(r3, 1);
+ Label slow_case;
+ Label done;
+ Label initialize_header;
+ Label initialize_object; // including clearing the fields
+ Label allocate_shared;
+
+ __ get_cpool_and_tags(r4, r0);
+ // Make sure the class we're about to instantiate has been resolved.
+ // This is done before loading InstanceKlass to be consistent with the order
+  // in which the Constant Pool is updated (see ConstantPool::klass_at_put)
+ const int tags_offset = Array<u1>::base_offset_in_bytes();
+ __ lea(rscratch1, Address(r0, r3, Address::lsl(0)));
+ __ ldrb(rscratch1, Address(rscratch1, tags_offset));
+ __ cmp(rscratch1, JVM_CONSTANT_Class);
+ __ br(Assembler::NE, slow_case);
+
+ // get InstanceKlass
+ __ lea(r4, Address(r4, r3, Address::lsl(3)));
+ __ ldr(r4, Address(r4, sizeof(ConstantPool)));
+
+ // make sure klass is initialized & doesn't have finalizer
+ // make sure klass is fully initialized
+ __ ldrb(rscratch1, Address(r4, InstanceKlass::init_state_offset()));
+ __ cmp(rscratch1, InstanceKlass::fully_initialized);
+ __ br(Assembler::NE, slow_case);
+
+ // get instance_size in InstanceKlass (scaled to a count of bytes)
+ __ ldrw(r3,
+ Address(r4,
+ Klass::layout_helper_offset()));
+ // test to see if it has a finalizer or is malformed in some way
+ __ tbnz(r3, exact_log2(Klass::_lh_instance_slow_path_bit), slow_case);
+
+ // Allocate the instance
+ // 1) Try to allocate in the TLAB
+  // 2) if that fails and the object is large, allocate in the shared Eden
+ // 3) if the above fails (or is not applicable), go to a slow case
+ // (creates a new TLAB, etc.)
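+  //
+  // In outline (a sketch only; the labels below carry the real control flow):
+  //   UseTLAB            -> tlab_allocate,  falling back to allocate_shared or slow_case
+  //   allow_shared_alloc -> eden_allocate,  falling back to slow_case
+  //   slow_case          -> call_VM(InterpreterRuntime::_new)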
+
+ const bool allow_shared_alloc =
+ Universe::heap()->supports_inline_contig_alloc();
+
+ if (UseTLAB) {
+ __ tlab_allocate(r0, r3, 0, noreg, r1,
+ allow_shared_alloc ? allocate_shared : slow_case);
+
+ if (ZeroTLAB) {
+ // the fields have been already cleared
+ __ b(initialize_header);
+ } else {
+ // initialize both the header and fields
+ __ b(initialize_object);
+ }
+ }
+
+ // Allocation in the shared Eden, if allowed.
+ //
+ // r3: instance size in bytes
+ if (allow_shared_alloc) {
+ __ bind(allocate_shared);
+
+ __ eden_allocate(r0, r3, 0, r10, slow_case);
+ __ incr_allocated_bytes(rthread, r3, 0, rscratch1);
+ }
+
+ if (UseTLAB || Universe::heap()->supports_inline_contig_alloc()) {
+ // The object is initialized before the header. If the object size is
+ // zero, go directly to the header initialization.
+ __ bind(initialize_object);
+ __ sub(r3, r3, sizeof(oopDesc));
+ __ cbz(r3, initialize_header);
+
+ // Initialize object fields
+ {
+ __ add(r2, r0, sizeof(oopDesc));
+ Label loop;
+ __ bind(loop);
+ __ str(zr, Address(__ post(r2, BytesPerLong)));
+ __ sub(r3, r3, BytesPerLong);
+ __ cbnz(r3, loop);
+ }
+
+ // initialize object header only.
+ __ bind(initialize_header);
+ if (UseBiasedLocking) {
+ __ ldr(rscratch1, Address(r4, Klass::prototype_header_offset()));
+ } else {
+ __ mov(rscratch1, (intptr_t)markOopDesc::prototype());
+ }
+ __ str(rscratch1, Address(r0, oopDesc::mark_offset_in_bytes()));
+ __ store_klass_gap(r0, zr); // zero klass gap for compressed oops
+ __ store_klass(r0, r4); // store klass last
+
+ {
+ SkipIfEqual skip(_masm, &DTraceAllocProbes, false);
+ // Trigger dtrace event for fastpath
+ __ push(atos); // save the return value
+ __ call_VM_leaf(
+ CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_object_alloc), r0);
+ __ pop(atos); // restore the return value
+
+ }
+ __ b(done);
+ }
+
+ // slow case
+ __ bind(slow_case);
+ __ get_constant_pool(c_rarg1);
+ __ get_unsigned_2_byte_index_at_bcp(c_rarg2, 1);
+ call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::_new), c_rarg1, c_rarg2);
+ __ verify_oop(r0);
+
+ // continue
+ __ bind(done);
+ // Must prevent reordering of stores for object initialization with stores that publish the new object.
+ __ membar(Assembler::StoreStore);
+}
+
+void TemplateTable::newarray() {
+ transition(itos, atos);
+ __ load_unsigned_byte(c_rarg1, at_bcp(1));
+ __ mov(c_rarg2, r0);
+ call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::newarray),
+ c_rarg1, c_rarg2);
+ // Must prevent reordering of stores for object initialization with stores that publish the new object.
+ __ membar(Assembler::StoreStore);
+}
+
+void TemplateTable::anewarray() {
+ transition(itos, atos);
+ __ get_unsigned_2_byte_index_at_bcp(c_rarg2, 1);
+ __ get_constant_pool(c_rarg1);
+ __ mov(c_rarg3, r0);
+ call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::anewarray),
+ c_rarg1, c_rarg2, c_rarg3);
+ // Must prevent reordering of stores for object initialization with stores that publish the new object.
+ __ membar(Assembler::StoreStore);
+}
+
+void TemplateTable::arraylength() {
+ transition(atos, itos);
+ __ null_check(r0, arrayOopDesc::length_offset_in_bytes());
+ __ ldrw(r0, Address(r0, arrayOopDesc::length_offset_in_bytes()));
+}
+
+void TemplateTable::checkcast()
+{
+ transition(atos, atos);
+ Label done, is_null, ok_is_subtype, quicked, resolved;
+ __ cbz(r0, is_null);
+
+ // Get cpool & tags index
+ __ get_cpool_and_tags(r2, r3); // r2=cpool, r3=tags array
+ __ get_unsigned_2_byte_index_at_bcp(r19, 1); // r19=index
+ // See if bytecode has already been quicked
+ __ add(rscratch1, r3, Array<u1>::base_offset_in_bytes());
+ __ ldrb(r1, Address(rscratch1, r19));
+ __ cmp(r1, JVM_CONSTANT_Class);
+ __ br(Assembler::EQ, quicked);
+
+ __ push(atos); // save receiver for result, and for GC
+ call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::quicken_io_cc));
+ // vm_result_2 has metadata result
+ __ get_vm_result_2(r0, rthread);
+ __ pop(r3); // restore receiver
+ __ b(resolved);
+
+ // Get superklass in r0 and subklass in r3
+ __ bind(quicked);
+ __ mov(r3, r0); // Save object in r3; r0 needed for subtype check
+ __ lea(r0, Address(r2, r19, Address::lsl(3)));
+ __ ldr(r0, Address(r0, sizeof(ConstantPool)));
+
+ __ bind(resolved);
+ __ load_klass(r19, r3);
+
+ // Generate subtype check. Blows r2, r5. Object in r3.
+ // Superklass in r0. Subklass in r19.
+ __ gen_subtype_check(r19, ok_is_subtype);
+
+ // Come here on failure
+ __ push(r3);
+ // object is at TOS
+ __ b(Interpreter::_throw_ClassCastException_entry);
+
+ // Come here on success
+ __ bind(ok_is_subtype);
+ __ mov(r0, r3); // Restore object in r3
+
+ // Collect counts on whether this test sees NULLs a lot or not.
+ if (ProfileInterpreter) {
+ __ b(done);
+ __ bind(is_null);
+ __ profile_null_seen(r2);
+ } else {
+ __ bind(is_null); // same as 'done'
+ }
+ __ bind(done);
+}
+
+void TemplateTable::instanceof() {
+ transition(atos, itos);
+ Label done, is_null, ok_is_subtype, quicked, resolved;
+ __ cbz(r0, is_null);
+
+ // Get cpool & tags index
+ __ get_cpool_and_tags(r2, r3); // r2=cpool, r3=tags array
+ __ get_unsigned_2_byte_index_at_bcp(r19, 1); // r19=index
+ // See if bytecode has already been quicked
+ __ add(rscratch1, r3, Array<u1>::base_offset_in_bytes());
+ __ ldrb(r1, Address(rscratch1, r19));
+ __ cmp(r1, JVM_CONSTANT_Class);
+ __ br(Assembler::EQ, quicked);
+
+ __ push(atos); // save receiver for result, and for GC
+ call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::quicken_io_cc));
+ // vm_result_2 has metadata result
+ __ get_vm_result_2(r0, rthread);
+ __ pop(r3); // restore receiver
+ __ verify_oop(r3);
+ __ load_klass(r3, r3);
+ __ b(resolved);
+
+ // Get superklass in r0 and subklass in r3
+ __ bind(quicked);
+ __ load_klass(r3, r0);
+ __ lea(r0, Address(r2, r19, Address::lsl(3)));
+ __ ldr(r0, Address(r0, sizeof(ConstantPool)));
+
+ __ bind(resolved);
+
+ // Generate subtype check. Blows r2, r5
+ // Superklass in r0. Subklass in r3.
+ __ gen_subtype_check(r3, ok_is_subtype);
+
+ // Come here on failure
+ __ mov(r0, 0);
+ __ b(done);
+ // Come here on success
+ __ bind(ok_is_subtype);
+ __ mov(r0, 1);
+
+ // Collect counts on whether this test sees NULLs a lot or not.
+ if (ProfileInterpreter) {
+ __ b(done);
+ __ bind(is_null);
+ __ profile_null_seen(r2);
+ } else {
+ __ bind(is_null); // same as 'done'
+ }
+ __ bind(done);
+ // r0 = 0: obj == NULL or obj is not an instanceof the specified klass
+ // r0 = 1: obj != NULL and obj is an instanceof the specified klass
+}
+
+//-----------------------------------------------------------------------------
+// Breakpoints
+void TemplateTable::_breakpoint() {
+  // Note: We get here even if we are single stepping...
+  // jbug insists on setting breakpoints at every bytecode
+ // even if we are in single step mode.
+
+ transition(vtos, vtos);
+
+ // get the unpatched byte code
+ __ get_method(c_rarg1);
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::get_original_bytecode_at),
+ c_rarg1, rbcp);
+ __ mov(r19, r0);
+
+ // post the breakpoint event
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address, InterpreterRuntime::_breakpoint),
+ rmethod, rbcp);
+
+ // complete the execution of original bytecode
+ __ mov(rscratch1, r19);
+ __ dispatch_only_normal(vtos);
+}
+
+//-----------------------------------------------------------------------------
+// Exceptions
+
+void TemplateTable::athrow() {
+ transition(atos, vtos);
+ __ null_check(r0);
+ __ b(Interpreter::throw_exception_entry());
+}
+
+//-----------------------------------------------------------------------------
+// Synchronization
+//
+// Note: monitorenter & exit are symmetric routines; which is reflected
+// in the assembly code structure as well
+//
+// Stack layout:
+//
+// [expressions ] <--- esp = expression stack top
+// ..
+// [expressions ]
+// [monitor entry] <--- monitor block top = expression stack bot
+// ..
+// [monitor entry]
+// [frame data ] <--- monitor block bot
+// ...
+// [saved rbp ] <--- rbp
+void TemplateTable::monitorenter()
+{
+ transition(atos, vtos);
+
+ // check for NULL object
+ __ null_check(r0);
+
+ const Address monitor_block_top(
+ rfp, frame::interpreter_frame_monitor_block_top_offset * wordSize);
+ const Address monitor_block_bot(
+ rfp, frame::interpreter_frame_initial_sp_offset * wordSize);
+ const int entry_size = frame::interpreter_frame_monitor_size() * wordSize;
+
+ Label allocated;
+
+ // initialize entry pointer
+ __ mov(c_rarg1, zr); // points to free slot or NULL
+
+ // find a free slot in the monitor block (result in c_rarg1)
+ {
+ Label entry, loop, exit;
+ __ ldr(c_rarg3, monitor_block_top); // points to current entry,
+ // starting with top-most entry
+ __ lea(c_rarg2, monitor_block_bot); // points to word before bottom
+
+ __ b(entry);
+
+ __ bind(loop);
+ // check if current entry is used
+ // if not used then remember entry in c_rarg1
+ __ ldr(rscratch1, Address(c_rarg3, BasicObjectLock::obj_offset_in_bytes()));
+ __ cmp(zr, rscratch1);
+ __ csel(c_rarg1, c_rarg3, c_rarg1, Assembler::EQ);
+ // check if current entry is for same object
+ __ cmp(r0, rscratch1);
+ // if same object then stop searching
+ __ br(Assembler::EQ, exit);
+ // otherwise advance to next entry
+ __ add(c_rarg3, c_rarg3, entry_size);
+ __ bind(entry);
+ // check if bottom reached
+ __ cmp(c_rarg3, c_rarg2);
+ // if not at bottom then check this entry
+ __ br(Assembler::NE, loop);
+ __ bind(exit);
+ }
+
+ __ cbnz(c_rarg1, allocated); // check if a slot has been found and
+                                     // if found, continue with that one
+
+ // allocate one if there's no free slot
+ {
+ Label entry, loop, no_adjust;
+ // 1. compute new pointers // rsp: old expression stack top
+ __ ldr(c_rarg1, monitor_block_bot); // c_rarg1: old expression stack bottom
+ __ sub(esp, esp, entry_size); // move expression stack top
+ __ sub(c_rarg1, c_rarg1, entry_size); // move expression stack bottom
+ __ mov(c_rarg3, esp); // set start value for copy loop
+ __ str(c_rarg1, monitor_block_bot); // set new monitor block bottom
+
+ __ cmp(sp, c_rarg3); // Check if we need to move sp
+ __ br(Assembler::LO, no_adjust); // to allow more stack space
+ // for our new esp
+ __ sub(sp, sp, 2 * wordSize);
+ __ bind(no_adjust);
+
+ __ b(entry);
+ // 2. move expression stack contents
+ __ bind(loop);
+ __ ldr(c_rarg2, Address(c_rarg3, entry_size)); // load expression stack
+ // word from old location
+ __ str(c_rarg2, Address(c_rarg3, 0)); // and store it at new location
+ __ add(c_rarg3, c_rarg3, wordSize); // advance to next word
+ __ bind(entry);
+ __ cmp(c_rarg3, c_rarg1); // check if bottom reached
+ __ br(Assembler::NE, loop); // if not at bottom then
+ // copy next word
+ }
+
+ // call run-time routine
+ // c_rarg1: points to monitor entry
+ __ bind(allocated);
+
+ // Increment bcp to point to the next bytecode, so exception
+ // handling for async. exceptions work correctly.
+  // The object has already been popped from the stack, so the
+ // expression stack looks correct.
+ __ increment(rbcp);
+
+ // store object
+ __ str(r0, Address(c_rarg1, BasicObjectLock::obj_offset_in_bytes()));
+ __ lock_object(c_rarg1);
+
+ // check to make sure this monitor doesn't cause stack overflow after locking
+ __ save_bcp(); // in case of exception
+ __ generate_stack_overflow_check(0);
+
+ // The bcp has already been incremented. Just need to dispatch to
+ // next instruction.
+ __ dispatch_next(vtos);
+}
+
+
+void TemplateTable::monitorexit()
+{
+ transition(atos, vtos);
+
+ // check for NULL object
+ __ null_check(r0);
+
+ const Address monitor_block_top(
+ rfp, frame::interpreter_frame_monitor_block_top_offset * wordSize);
+ const Address monitor_block_bot(
+ rfp, frame::interpreter_frame_initial_sp_offset * wordSize);
+ const int entry_size = frame::interpreter_frame_monitor_size() * wordSize;
+
+ Label found;
+
+ // find matching slot
+ {
+ Label entry, loop;
+ __ ldr(c_rarg1, monitor_block_top); // points to current entry,
+ // starting with top-most entry
+ __ lea(c_rarg2, monitor_block_bot); // points to word before bottom
+ // of monitor block
+ __ b(entry);
+
+ __ bind(loop);
+ // check if current entry is for same object
+ __ ldr(rscratch1, Address(c_rarg1, BasicObjectLock::obj_offset_in_bytes()));
+ __ cmp(r0, rscratch1);
+ // if same object then stop searching
+ __ br(Assembler::EQ, found);
+ // otherwise advance to next entry
+ __ add(c_rarg1, c_rarg1, entry_size);
+ __ bind(entry);
+ // check if bottom reached
+ __ cmp(c_rarg1, c_rarg2);
+ // if not at bottom then check this entry
+ __ br(Assembler::NE, loop);
+ }
+
+ // error handling. Unlocking was not block-structured
+ __ call_VM(noreg, CAST_FROM_FN_PTR(address,
+ InterpreterRuntime::throw_illegal_monitor_state_exception));
+ __ should_not_reach_here();
+
+ // call run-time routine
+ __ bind(found);
+ __ push_ptr(r0); // make sure object is on stack (contract with oopMaps)
+ __ unlock_object(c_rarg1);
+ __ pop_ptr(r0); // discard object
+}
+
+
+// Wide instructions
+void TemplateTable::wide()
+{
+ __ load_unsigned_byte(r19, at_bcp(1));
+ __ mov(rscratch1, (address)Interpreter::_wentry_point);
+ __ ldr(rscratch1, Address(rscratch1, r19, Address::uxtw(3)));
+ __ br(rscratch1);
+}
+
+
+// Multi arrays
+void TemplateTable::multianewarray() {
+ transition(vtos, atos);
+ __ load_unsigned_byte(r0, at_bcp(3)); // get number of dimensions
+ // last dim is on top of stack; we want address of first one:
+ // first_addr = last_addr + (ndims - 1) * wordSize
+ __ lea(c_rarg1, Address(esp, r0, Address::uxtw(3)));
+ __ sub(c_rarg1, c_rarg1, wordSize);
+ call_VM(r0,
+ CAST_FROM_FN_PTR(address, InterpreterRuntime::multianewarray),
+ c_rarg1);
+ __ load_unsigned_byte(r1, at_bcp(3));
+ __ lea(esp, Address(esp, r1, Address::uxtw(3)));
+}
+#endif // !CC_INTERP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/templateTable_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_TEMPLATETABLE_AARCH64_64_HPP
+#define CPU_AARCH64_VM_TEMPLATETABLE_AARCH64_64_HPP
+
+static void prepare_invoke(int byte_no,
+ Register method, // linked method (or i-klass)
+ Register index = noreg, // itable index, MethodType, etc.
+ Register recv = noreg, // if caller wants to see it
+ Register flags = noreg // if caller wants to test it
+ );
+ static void invokevirtual_helper(Register index, Register recv,
+ Register flags);
+
+ // Helpers
+ static void index_check(Register array, Register index);
+ static void index_check_without_pop(Register array, Register index);
+
+#endif // CPU_AARCH64_VM_TEMPLATETABLE_AARCH64_64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/vmStructs_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_VMSTRUCTS_AARCH64_HPP
+#define CPU_AARCH64_VM_VMSTRUCTS_AARCH64_HPP
+
+// These are the CPU-specific fields, types and integer
+// constants required by the Serviceability Agent. This file is
+// referenced by vmStructs.cpp.
+
+#define VM_STRUCTS_CPU(nonstatic_field, static_field, unchecked_nonstatic_field, volatile_nonstatic_field, nonproduct_nonstatic_field, c2_nonstatic_field, unchecked_c1_static_field, unchecked_c2_static_field) \
+ \
+ /******************************/ \
+ /* JavaCallWrapper */ \
+ /******************************/ \
+ /******************************/ \
+ /* JavaFrameAnchor */ \
+ /******************************/ \
+ volatile_nonstatic_field(JavaFrameAnchor, _last_Java_fp, intptr_t*)
+
+
+#define VM_TYPES_CPU(declare_type, declare_toplevel_type, declare_oop_type, declare_integer_type, declare_unsigned_integer_type, declare_c1_toplevel_type, declare_c2_type, declare_c2_toplevel_type)
+
+#define VM_INT_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant)
+
+#define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant)
+
+#endif // CPU_AARCH64_VM_VMSTRUCTS_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "asm/macroAssembler.inline.hpp"
+#include "memory/resourceArea.hpp"
+#include "runtime/java.hpp"
+#include "runtime/stubCodeGenerator.hpp"
+#include "vm_version_aarch64.hpp"
+#ifdef TARGET_OS_FAMILY_linux
+# include "os_linux.inline.hpp"
+#endif
+
+#ifndef BUILTIN_SIM
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+#else
+#define getauxval(hwcap) 0
+#endif
+
+#ifndef HWCAP_AES
+#define HWCAP_AES (1<<3)
+#endif
+
+#ifndef HWCAP_SHA1
+#define HWCAP_SHA1 (1<<5)
+#endif
+
+#ifndef HWCAP_SHA2
+#define HWCAP_SHA2 (1<<6)
+#endif
+
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1<<7)
+#endif
+
+int VM_Version::_cpu;
+int VM_Version::_model;
+int VM_Version::_stepping;
+int VM_Version::_cpuFeatures;
+const char* VM_Version::_features_str = "";
+
+static BufferBlob* stub_blob;
+static const int stub_size = 550;
+
+extern "C" {
+ typedef void (*getPsrInfo_stub_t)(void*);
+}
+static getPsrInfo_stub_t getPsrInfo_stub = NULL;
+
+
+class VM_Version_StubGenerator: public StubCodeGenerator {
+ public:
+
+ VM_Version_StubGenerator(CodeBuffer *c) : StubCodeGenerator(c) {}
+
+ address generate_getPsrInfo() {
+ StubCodeMark mark(this, "VM_Version", "getPsrInfo_stub");
+# define __ _masm->
+ address start = __ pc();
+
+#ifdef BUILTIN_SIM
+ __ c_stub_prolog(1, 0, MacroAssembler::ret_type_void);
+#endif
+
+ // void getPsrInfo(VM_Version::CpuidInfo* cpuid_info);
+
+ address entry = __ pc();
+
+ // TODO : redefine fields in CpuidInfo and generate
+ // code to fill them in
+
+ __ ret(lr);
+
+# undef __
+
+ return start;
+ }
+};
+
+
+void VM_Version::get_processor_features() {
+ _supports_cx8 = true;
+ _supports_atomic_getset4 = true;
+ _supports_atomic_getadd4 = true;
+ _supports_atomic_getset8 = true;
+ _supports_atomic_getadd8 = true;
+
+ if (FLAG_IS_DEFAULT(AllocatePrefetchDistance))
+ FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256);
+ if (FLAG_IS_DEFAULT(AllocatePrefetchStepSize))
+ FLAG_SET_DEFAULT(AllocatePrefetchStepSize, 64);
+ FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 256);
+ FLAG_SET_DEFAULT(PrefetchFieldsAhead, 256);
+ FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 256);
+ FLAG_SET_DEFAULT(UseSSE42Intrinsics, true);
+
+ unsigned long auxv = getauxval(AT_HWCAP);
+
+ char buf[512];
+
+ strcpy(buf, "simd");
+ if (auxv & HWCAP_CRC32) strcat(buf, ", crc");
+ if (auxv & HWCAP_AES) strcat(buf, ", aes");
+ if (auxv & HWCAP_SHA1) strcat(buf, ", sha1");
+ if (auxv & HWCAP_SHA2) strcat(buf, ", sha256");
+
+ _features_str = strdup(buf);
+
+ if (FLAG_IS_DEFAULT(UseCRC32)) {
+ UseCRC32 = (auxv & HWCAP_CRC32) != 0;
+ }
+ if (UseCRC32 && (auxv & HWCAP_CRC32) == 0) {
+ warning("UseCRC32 specified, but not supported on this CPU");
+ }
+ if (auxv & HWCAP_AES) {
+ UseAES = UseAES || FLAG_IS_DEFAULT(UseAES);
+ UseAESIntrinsics =
+ UseAESIntrinsics || (UseAES && FLAG_IS_DEFAULT(UseAESIntrinsics));
+ if (UseAESIntrinsics && !UseAES) {
+ warning("UseAESIntrinsics enabled, but UseAES not, enabling");
+ UseAES = true;
+ }
+ } else {
+ if (UseAES) {
+ warning("UseAES specified, but not supported on this CPU");
+ }
+ if (UseAESIntrinsics) {
+ warning("UseAESIntrinsics specified, but not supported on this CPU");
+ }
+ }
+
+ if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) {
+ UseCRC32Intrinsics = true;
+ }
+
+ if (auxv & (HWCAP_SHA1 | HWCAP_SHA2)) {
+ if (FLAG_IS_DEFAULT(UseSHA)) {
+ FLAG_SET_DEFAULT(UseSHA, true);
+ }
+ } else if (UseSHA) {
+ warning("SHA instructions are not available on this CPU");
+ FLAG_SET_DEFAULT(UseSHA, false);
+ }
+
+ if (!UseSHA) {
+ FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
+ FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
+ FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
+ } else {
+ if (auxv & HWCAP_SHA1) {
+ if (FLAG_IS_DEFAULT(UseSHA1Intrinsics)) {
+ FLAG_SET_DEFAULT(UseSHA1Intrinsics, true);
+ }
+ } else if (UseSHA1Intrinsics) {
+ warning("SHA1 instruction is not available on this CPU.");
+ FLAG_SET_DEFAULT(UseSHA1Intrinsics, false);
+ }
+ if (auxv & HWCAP_SHA2) {
+ if (FLAG_IS_DEFAULT(UseSHA256Intrinsics)) {
+ FLAG_SET_DEFAULT(UseSHA256Intrinsics, true);
+ }
+ } else if (UseSHA256Intrinsics) {
+ warning("SHA256 instruction (for SHA-224 and SHA-256) is not available on this CPU.");
+ FLAG_SET_DEFAULT(UseSHA256Intrinsics, false);
+ }
+ if (UseSHA512Intrinsics) {
+ warning("SHA512 instruction (for SHA-384 and SHA-512) is not available on this CPU.");
+ FLAG_SET_DEFAULT(UseSHA512Intrinsics, false);
+ }
+ }
+
+#ifdef COMPILER2
+ if (FLAG_IS_DEFAULT(OptoScheduling)) {
+ OptoScheduling = true;
+ }
+#endif
+}
+
+void VM_Version::initialize() {
+ ResourceMark rm;
+
+ stub_blob = BufferBlob::create("getPsrInfo_stub", stub_size);
+ if (stub_blob == NULL) {
+ vm_exit_during_initialization("Unable to allocate getPsrInfo_stub");
+ }
+
+ CodeBuffer c(stub_blob);
+ VM_Version_StubGenerator g(&c);
+ getPsrInfo_stub = CAST_TO_FN_PTR(getPsrInfo_stub_t,
+ g.generate_getPsrInfo());
+
+ get_processor_features();
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_VM_VERSION_AARCH64_HPP
+#define CPU_AARCH64_VM_VM_VERSION_AARCH64_HPP
+
+#include "runtime/globals_extension.hpp"
+#include "runtime/vm_version.hpp"
+
+class VM_Version : public Abstract_VM_Version {
+protected:
+ static int _cpu;
+ static int _model;
+ static int _stepping;
+ static int _cpuFeatures; // detected processor features (AArch64 has no cpuid;
+ // see the HWCAP handling in get_processor_features)
+ static const char* _features_str;
+
+ static void get_processor_features();
+
+public:
+ // Initialization
+ static void initialize();
+
+ // Asserts
+ static void assert_is_initialized() {
+ }
+
+ static const char* cpu_features() { return _features_str; }
+
+};
+
+#endif // CPU_AARCH64_VM_VM_VERSION_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/vmreg_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/assembler.hpp"
+#include "code/vmreg.hpp"
+
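+// Fill in the VMReg name table. Each AArch64 register is represented by
+// two consecutive VMReg slots, so every register name is entered twice.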
+void VMRegImpl::set_regName() {
+ Register reg = ::as_Register(0);
+ int i;
+ for (i = 0; i < ConcreteRegisterImpl::max_gpr ; ) {
+ regName[i++] = reg->name();
+ regName[i++] = reg->name();
+ reg = reg->successor();
+ }
+
+ FloatRegister freg = ::as_FloatRegister(0);
+ for ( ; i < ConcreteRegisterImpl::max_fpr ; ) {
+ regName[i++] = freg->name();
+ regName[i++] = freg->name();
+ freg = freg->successor();
+ }
+
+ for ( ; i < ConcreteRegisterImpl::number_of_registers ; i ++ ) {
+ regName[i] = "NON-GPR-FPR";
+ }
+}
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/vmreg_aarch64.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_VMREG_AARCH64_HPP
+#define CPU_AARCH64_VM_VMREG_AARCH64_HPP
+
+inline bool is_Register() {
+ return (unsigned int) value() < (unsigned int) ConcreteRegisterImpl::max_gpr;
+}
+
+inline bool is_FloatRegister() {
+ return value() >= ConcreteRegisterImpl::max_gpr && value() < ConcreteRegisterImpl::max_fpr;
+}
+
+inline Register as_Register() {
+ assert( is_Register(), "must be");
+ // Each GPR occupies two VMReg slots, so halve the value to recover
+ // the register number.
+ return ::as_Register(value() >> 1);
+}
+
+inline FloatRegister as_FloatRegister() {
+ assert( is_FloatRegister() && is_even(value()), "must be" );
+ // FPRs follow the GPRs in the VMReg name space and also take two slots
+ // each, so rebase past the GPR slots and halve.
+ return ::as_FloatRegister((value() - ConcreteRegisterImpl::max_gpr) >> 1);
+}
+
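+// Only the first (even) slot of each two-slot register pair names a
+// concrete register.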
+inline bool is_concrete() {
+ assert(is_reg(), "must be");
+ return is_even(value());
+}
+
+#endif // CPU_AARCH64_VM_VMREG_AARCH64_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/vmreg_aarch64.inline.hpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH64_VM_VMREG_AARCH64_INLINE_HPP
+#define CPU_AARCH64_VM_VMREG_AARCH64_INLINE_HPP
+
+// The VMReg index is twice the register encoding because every register
+// maps to two consecutive VMReg slots; FPRs are placed after the GPR slots.
+inline VMReg RegisterImpl::as_VMReg() {
+ if (this == noreg) return VMRegImpl::Bad();
+ return VMRegImpl::as_VMReg(encoding() << 1);
+}
+
+inline VMReg FloatRegisterImpl::as_VMReg() {
+ return VMRegImpl::as_VMReg((encoding() << 1) + ConcreteRegisterImpl::max_gpr);
+}
+
+#endif // CPU_AARCH64_VM_VMREG_AARCH64_INLINE_HPP
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/hotspot/src/cpu/aarch64/vm/vtableStubs_aarch64.cpp Tue Jan 20 11:34:17 2015 -0800
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include "precompiled.hpp"
+#include "asm/macroAssembler.hpp"
+#include "assembler_aarch64.inline.hpp"
+#include "code/vtableStubs.hpp"
+#include "interp_masm_aarch64.hpp"
+#include "memory/resourceArea.hpp"
+#include "oops/instanceKlass.hpp"
+#include "oops/klassVtable.hpp"
+#include "runtime/sharedRuntime.hpp"
+#include "vmreg_aarch64.inline.hpp"
+#ifdef COMPILER2
+#include "opto/runtime.hpp"
+#endif
+
+// machine-dependent part of VtableStubs: create VtableStub of correct size and
+// initialize its code
+
+#define __ masm->
+
+#ifndef PRODUCT
+extern "C" void bad_compiled_vtable_index(JavaThread* thread,
+ oop receiver,
+ int index);
+#endif
+
+VtableStub* VtableStubs::create_vtable_stub(int vtable_index) {
+ const int aarch64_code_length = VtableStub::pd_code_size_limit(true);
+ VtableStub* s = new(aarch64_code_length) VtableStub(true, vtable_index);
+ ResourceMark rm;
+ CodeBuffer cb(s->entry_point(), aarch64_code_length);
+ MacroAssembler* masm = new MacroAssembler(&cb);
+
+#ifndef PRODUCT
+ if (CountCompiledCalls) {
+ __ lea(r19, ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()));
+ __ incrementw(Address(r19));
+ }
+#endif
+
+ // get receiver (expected in j_rarg0)
+ assert(VtableStub::receiver_location() == j_rarg0->as_VMReg(), "receiver expected in j_rarg0");
+
+ // get receiver klass
+ address npe_addr = __ pc();
+ __ load_klass(r19, j_rarg0);
+
+#ifndef PRODUCT
+ if (DebugVtables) {
+ Label L;
+ // check offset vs vtable length
+ __ ldrw(rscratch1, Address(r19, InstanceKlass::vtable_length_offset() * wordSize));
+ __ cmpw(rscratch1, vtable_index * vtableEntry::size());
+ __ br(Assembler::GT, L);
+ __ enter();
+ __ mov(r2, vtable_index);
+ __ call_VM(noreg,
+ CAST_FROM_FN_PTR(address, bad_compiled_vtable_index), j_rarg0, r2);
+ __ leave();
+ __ bind(L);
+ }
+#endif // PRODUCT
+
+ __ lookup_virtual_method(r19, vtable_index, rmethod);
+
+ if (DebugVtables) {
+ Label L;
+ __ cbz(rmethod, L);
+ __ ldr(rscratch1, Address(rmethod, Method::from_compiled_offset()));
+ __ cbnz(rscratch1, L);
+ __ stop("Vtable entry is NULL");
+ __ bind(L);
+ }
+ // r19: receiver klass
+ // rmethod: Method*
+ // j_rarg0: receiver
+ address ame_addr = __ pc();
+ __ ldr(rscratch1, Address(rmethod, Method::from_compiled_offset()));
+ __ br(rscratch1);
+
+ __ flush();
+
+ if (PrintMiscellaneous && (WizardMode || Verbose)) {
+ tty->print_cr("vtable #%d at "PTR_FORMAT"[%d] left over: %d",
+ vtable_index, p2i(s->entry_point()),
+ (int)(s->code_end() - s->entry_point()),
+ (int)(s->code_end() - __ pc()));
+ }
+ guarantee(__ pc() <= s->code_end(), "overflowed buffer");
+
+ s->set_exception_points(npe_addr, ame_addr);
+ return s;
+}
+
+
+VtableStub* VtableStubs::create_itable_stub(int itable_index) {
+ // Note well: pd_code_size_limit is the absolute minimum we can get
+ // away with. If you add code here, bump the code stub size
+ // returned by pd_code_size_limit!
+ const int code_length = VtableStub::pd_code_size_limit(false);
+ VtableStub* s = new(code_length) VtableStub(false, itable_index);
+ ResourceMark rm;
+ CodeBuffer cb(s->entry_point(), code_length);
+ MacroAssembler* masm = new MacroAssembler(&cb);
+
+#ifndef PRODUCT
+ if (CountCompiledCalls) {
+ __ lea(r10, ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()));
+ __ incrementw(Address(r10));
+ }
+#endif
+
+ // Entry arguments:
+ // rscratch2: Interface
+ // j_rarg0: Receiver
+
+ // Free registers (non-args) are r0 (interface), rmethod
+
+ // get receiver (expected in j_rarg0)
+
+ assert(VtableStub::receiver_location() == j_rarg0->as_VMReg(), "receiver expected in j_rarg0");
+ // get receiver klass (also an implicit null-check)
+ address npe_addr = __ pc();
+
+ // Most registers are in use; we'll use r0, rmethod, r10, r11
+ __ load_klass(r10, j_rarg0);
+
+ Label throw_icce;
+
+ // Get Method* and entrypoint for compiler
+ __ lookup_interface_method(// inputs: rec. class, interface, itable index
+ r10, rscratch2, itable_index,
+ // outputs: method, scan temp. reg
+ rmethod, r11,
+ throw_icce);
+
+ // method (rmethod): Method*
+ // j_rarg0: receiver
+
+#ifdef ASSERT
+ if (DebugVtables) {
+ Label L2;
+ __ cbz(rmethod, L2);
+ __ ldr(rscratch1, Address(rmethod, Method::from_compiled_offset()));
+ __ cbnz(rscratch1, L2);
+ __ stop("compiler entrypoint is null");
+ __ bind(L2);
+ }
+#endif // ASSERT
+
+ // rmethod: Method*
+ // j_rarg0: receiver
+ address ame_addr = __ pc();
+ __ ldr(rscratch1, Address(rmethod, Method::from_compiled_offset()));
+ __ br(rscratch1);
+
+ __ bind(throw_icce);
+ __ far_jump(RuntimeAddress(StubRoutines::throw_IncompatibleClassChangeError_entry()));
+
+ __ flush();
+
+ if (PrintMiscellaneous && (WizardMode || Verbose)) {
+ tty->print_cr("itable #%d at "PTR_FORMAT"[%d] left over: %d",
+ itable_index, p2i(s->entry_point()),
+ (int)(s->code_end() - s->entry_point()),
+ (int)(s->code_end() - __ pc()));
+ }
+ guarantee(__ pc() <= s->code_end(), "overflowed buffer");
+
+ s->set_exception_points(npe_addr, ame_addr);
+ return s;
+}
+
+
+int VtableStub::pd_code_size_limit(bool is_vtable_stub) {
+ int size = DebugVtables ? 216 : 0;
+ if (CountCompiledCalls)
+ size += 6 * 4;
+ // FIXME
+ if (is_vtable_stub)
+ size += 52;
+ else
+ size += 104;
+ return size;
+
+ // In order to tune these parameters, run the JVM with VM options
+ // +PrintMiscellaneous and +WizardMode to see information about
+ // actual itable stubs. Run it with -Xmx31G -XX:+UseCompressedOops.
+ //
+ // If Universe::narrow_klass_base is nonzero, decoding a compressed
+ // class can take several instructions.
+ //
+ // The JVM98 app. _202_jess has a megamorphic interface call.
+ // The itable code looks like this:
+ // Decoding VtableStub itbl[1]@12
+ // ldr w10, [x1,#8]
+ // lsl x10, x10, #3
+ // ldr w11, [x10,#280]
+ // add x11, x10, x11, uxtx #3
+ // add x11, x11, #0x1b8
+ // ldr x12, [x11]
+ // cmp x9, x12
+ // b.eq success
+ // loop:
+ // cbz x12, throw_icce
+ // add x11, x11, #0x10
+ // ldr x12, [x11]
+ // cmp x9, x12
+ // b.ne loop
+ // success:
+ // ldr x11, [x11,#8]
+ // ldr x12, [x10,x11]
+ // ldr x8, [x12,#72]
+ // br x8
+ // throw_icce:
+ // b throw_ICCE_entry
+
+}
+
+int VtableStub::pd_code_alignment() { return 4; }