hotspot/src/cpu/s390/vm/copy_s390.hpp
changeset 42065 6032b31e3719
       
     1 /*
       
     2  * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
       
     3  * Copyright (c) 2016 SAP SE. All rights reserved.
       
     4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
       
     5  *
       
     6  * This code is free software; you can redistribute it and/or modify it
       
     7  * under the terms of the GNU General Public License version 2 only, as
       
     8  * published by the Free Software Foundation.
       
     9  *
       
    10  * This code is distributed in the hope that it will be useful, but WITHOUT
       
    11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
       
    12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
       
    13  * version 2 for more details (a copy is included in the LICENSE file that
       
    14  * accompanied this code).
       
    15  *
       
    16  * You should have received a copy of the GNU General Public License version
       
    17  * 2 along with this work; if not, write to the Free Software Foundation,
       
    18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
       
    19  *
       
    20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
       
    21  * or visit www.oracle.com if you need additional information or have any
       
    22  * questions.
       
    23  *
       
    24  */
       
    25 
       
    26 // Major contributions by LS
       
    27 
       
    28 #ifndef CPU_S390_VM_COPY_S390_HPP
       
    29 #define CPU_S390_VM_COPY_S390_HPP
       
    30 
       
    31 // Inline functions for memory copy and fill.
       
    32 
       
    33 // HeapWordSize (the size of class HeapWord) is 8 Bytes (the size of a
       
    34 // pointer variable), since we always run the _LP64 model. As a consequence,
       
    35 // HeapWord* memory ranges are always assumed to be doubleword-aligned,
       
    36 // having a size which is an integer multiple of HeapWordSize.
       
    37 //
       
    38 // Dealing only with doubleword-aligned doubleword units has important
       
    39 // positive performance and data access consequences. Many of the move
       
    40 // instructions perform particularly well under these circumstances.
       
    41 // Data access is "doubleword-concurrent", except for MVC and XC.
       
    42 // Furthermore, data access can be forced to be sequential (MVCL and MVCLE)
       
    43 // by use of the special padding byte 0xb1, where required. For copying,
       
    44 // we use padding byte 0xb0 to prevent the D-cache from being polluted.
       
    45 //
       
    46 // On z/Architecture, gcc optimizes memcpy into a series of MVC instructions.
       
    47 // This is optimal, even if just one HeapWord is copied. However, MVC
       
    48 // copying is not atomic, i.e. not "doubleword concurrent" by definition.
       
    49 //
       
    50 // If the -mmvcle compiler option is specified, memcpy translates into
       
    51 // code such that the entire memory range is copied or preset with just
       
    52 // one MVCLE instruction.
       
    53 //
       
     54 // *to = *from is transformed into an MVC instruction already at -O1.
       
    55 // Thus, for atomic copy operations, (inline) assembler code is required
       
    56 // to guarantee atomic data accesses.
       
    57 //
       
    58 // For large (len >= MVCLEThreshold) chunks of memory, we exploit
       
    59 // special H/W support of z/Architecture:
       
    60 // 1) copy short piece of memory to page-align address(es)
       
    61 // 2) copy largest part (all contained full pages) of memory using mvcle instruction.
       
    62 //    z/Architecture processors have special H/W support for page-aligned storage
       
    63 //    where len is an int multiple of page size. In that case, up to 4 cache lines are
       
    64 //    processed in parallel and L1 cache is not polluted.
       
    65 // 3) copy the remaining piece of memory.
       
    66 //
       
    67 //  Measurement classifications:
       
    68 //  very rare - <=     10.000 calls AND <=     1.000 usec elapsed
       
    69 //       rare - <=    100.000 calls AND <=    10.000 usec elapsed
       
    70 //       some - <=  1.000.000 calls AND <=   100.000 usec elapsed
       
    71 //       freq - <= 10.000.000 calls AND <= 1.000.000 usec elapsed
       
    72 //  very freq - >  10.000.000 calls OR  >  1.000.000 usec elapsed
       
    73 
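// Illustrative sketch only (kept under "#if 0", like the debug snippets
// further down): the three-step strategy for large copies, outlined above,
// written as plain C++. The helper name is hypothetical; the middle loop
// stands in for the MVCLE-based macro defined below.
#if 0
static void large_copy_sketch(jbyte* from, jbyte* to, size_t len_bytes) {
  const size_t page = 4096;
  // 1) Copy a short piece to page-align the target address.
  size_t head = (page - ((size_t)to & (page - 1))) & (page - 1);
  if (head > len_bytes) { head = len_bytes; }
  for (size_t i = 0; i < head; i++) { to[i] = from[i]; }
  to += head; from += head; len_bytes -= head;
  // 2) Copy all contained full pages. The real code uses MVCLE; with
  //    page-aligned storage, up to 4 cache lines are processed in parallel
  //    and the L1 cache is not polluted.
  size_t bulk = len_bytes & ~(page - 1);
  for (size_t i = 0; i < bulk; i++) { to[i] = from[i]; }
  to += bulk; from += bulk; len_bytes -= bulk;
  // 3) Copy the remaining piece.
  for (size_t i = 0; i < len_bytes; i++) { to[i] = from[i]; }
}
#endif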
       
    74 #undef USE_INLINE_ASM
       
    75 
       
    76 static void copy_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {
       
    77   if (from > to) {
       
    78     while (count-- > 0) {
       
    79       // Copy forwards
       
    80       *to++ = *from++;
       
    81     }
       
    82   } else {
       
    83     from += count - 1;
       
    84     to   += count - 1;
       
    85     while (count-- > 0) {
       
    86       // Copy backwards
       
    87       *to-- = *from--;
       
    88     }
       
    89   }
       
    90 }
       
    91 
       
    92 static void copy_conjoint_jints_atomic(jint* from, jint* to, size_t count) {
       
    93   if (from > to) {
       
    94     while (count-- > 0) {
       
    95       // Copy forwards
       
    96       *to++ = *from++;
       
    97     }
       
    98   } else {
       
    99     from += count - 1;
       
   100     to   += count - 1;
       
   101     while (count-- > 0) {
       
   102       // Copy backwards
       
   103       *to-- = *from--;
       
   104     }
       
   105   }
       
   106 }
       
   107 
       
   108 static bool has_destructive_overlap(char* from, char* to, size_t byte_count) {
       
   109   return (from < to) && ((to-from) < (ptrdiff_t)byte_count);
       
   110 }
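// Example (illustrative): for from = X, to = X + 8, and byte_count = 16, the
// ranges overlap and a plain forward copy would overwrite the last 8 source
// bytes before they are read, so the predicate returns true. For to <= from,
// or for to >= from + byte_count, it returns false and a forward copy
// (or memcpy) is safe.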
       
   111 
       
   112 #ifdef USE_INLINE_ASM
       
   113 
       
   114   //--------------------------------------------------------------
       
   115   // Atomic copying. Atomicity is given by the minimum of source
       
   116   // and target alignment. Refer to mail comm with Tim Slegel/IBM.
       
   117   // Only usable for disjoint source and target.
       
   118   //--------------------------------------------------------------
       
   119   #define MOVE8_ATOMIC_4(_to,_from) {                            \
       
   120     unsigned long toaddr;                                        \
       
   121     unsigned long fromaddr;                                      \
       
   122     asm(                                                         \
       
   123       "LG      %[toaddr],%[to]     \n\t" /* address of to area   */ \
       
   124       "LG      %[fromaddr],%[from] \n\t" /* address of from area */ \
       
   125       "MVC     0(32,%[toaddr]),0(%[fromaddr]) \n\t" /* move data */ \
       
   126       : [to]       "+Q"  (_to)          /* outputs   */          \
       
   127       , [from]     "+Q"  (_from)                                 \
       
   128       , [toaddr]   "=a"  (toaddr)                                \
       
   129       , [fromaddr] "=a"  (fromaddr)                              \
       
   130       :                                                          \
       
   131       : "cc"                            /* clobbered */          \
       
   132     );                                                           \
       
   133   }
       
   134   #define MOVE8_ATOMIC_3(_to,_from) {                            \
       
   135     unsigned long toaddr;                                        \
       
   136     unsigned long fromaddr;                                      \
       
   137     asm(                                                         \
       
   138       "LG      %[toaddr],%[to]     \n\t" /* address of to area   */ \
       
   139       "LG      %[fromaddr],%[from] \n\t" /* address of from area */ \
       
   140       "MVC     0(24,%[toaddr]),0(%[fromaddr]) \n\t" /* move data */ \
       
   141       : [to]       "+Q"  (_to)          /* outputs   */          \
       
   142       , [from]     "+Q"  (_from)                                 \
       
   143       , [toaddr]   "=a"  (toaddr)                                \
       
   144       , [fromaddr] "=a"  (fromaddr)                              \
       
   145       :                                                          \
       
   146       : "cc"                            /* clobbered */          \
       
   147     );                                                           \
       
   148   }
       
   149   #define MOVE8_ATOMIC_2(_to,_from) {                            \
       
   150     unsigned long toaddr;                                        \
       
   151     unsigned long fromaddr;                                      \
       
   152     asm(                                                         \
       
   153       "LG      %[toaddr],%[to]     \n\t" /* address of to area   */ \
       
   154       "LG      %[fromaddr],%[from] \n\t" /* address of from area */ \
       
   155       "MVC     0(16,%[toaddr]),0(%[fromaddr]) \n\t" /* move data */ \
       
   156       : [to]       "+Q"  (_to)          /* outputs   */          \
       
   157       , [from]     "+Q"  (_from)                                 \
       
   158       , [toaddr]   "=a"  (toaddr)                                \
       
   159       , [fromaddr] "=a"  (fromaddr)                              \
       
   160       :                                                          \
       
   161       : "cc"                            /* clobbered */          \
       
   162     );                                                           \
       
   163   }
       
   164   #define MOVE8_ATOMIC_1(_to,_from) {                            \
       
   165     unsigned long toaddr;                                        \
       
   166     unsigned long fromaddr;                                      \
       
   167     asm(                                                         \
       
   168       "LG      %[toaddr],%[to]     \n\t" /* address of to area   */ \
       
   169       "LG      %[fromaddr],%[from] \n\t" /* address of from area */ \
       
   170       "MVC     0(8,%[toaddr]),0(%[fromaddr]) \n\t"  /* move data */ \
       
   171       : [to]       "+Q"  (_to)          /* outputs   */          \
       
   172       , [from]     "+Q"  (_from)                                 \
       
   173       , [toaddr]   "=a"  (toaddr)                                \
       
   174       , [fromaddr] "=a"  (fromaddr)                              \
       
   175       :                                                          \
       
   176       : "cc"                            /* clobbered */          \
       
   177     );                                                           \
       
   178   }
       
   179 
       
   180   //--------------------------------------------------------------
       
   181   // Atomic copying of 8-byte entities.
       
   182   // Conjoint/disjoint property does not matter. Entities are first
       
   183   // loaded and then stored.
       
   184   // _to and _from must be 8-byte aligned.
       
   185   //--------------------------------------------------------------
       
   186   #define COPY8_ATOMIC_4(_to,_from) {                            \
       
   187     unsigned long toaddr;                                        \
       
   188     asm(                                                         \
       
   189       "LG      3,%[from]        \n\t" /* address of from area */ \
       
   190       "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
       
   191       "LMG     0,3,0(3)         \n\t" /* load data            */ \
       
   192       "STMG    0,3,0(%[toaddr]) \n\t" /* store data           */ \
       
   193       : [to]     "+Q"  (_to)          /* outputs   */            \
       
   194       , [from]   "+Q"  (_from)        /* outputs   */            \
       
    195       , [toaddr] "=a"  (toaddr)       /* outputs   */            \
       
   196       :                                                          \
       
   197       : "cc",  "r0", "r1", "r2", "r3" /* clobbered */            \
       
   198     );                                                           \
       
   199   }
       
   200   #define COPY8_ATOMIC_3(_to,_from) {                            \
       
   201     unsigned long toaddr;                                        \
       
   202     asm(                                                         \
       
   203       "LG      2,%[from]        \n\t" /* address of from area */ \
       
   204       "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
       
   205       "LMG     0,2,0(2)         \n\t" /* load data            */ \
       
   206       "STMG    0,2,0(%[toaddr]) \n\t" /* store data           */ \
       
   207       : [to]     "+Q"  (_to)          /* outputs   */            \
       
   208       , [from]   "+Q"  (_from)        /* outputs   */            \
       
    209       , [toaddr] "=a"  (toaddr)       /* outputs   */            \
       
   210       :                                                          \
       
   211       : "cc",  "r0", "r1", "r2"       /* clobbered */            \
       
   212     );                                                           \
       
   213   }
       
   214   #define COPY8_ATOMIC_2(_to,_from) {                            \
       
   215     unsigned long toaddr;                                        \
       
   216     asm(                                                         \
       
   217       "LG      1,%[from]        \n\t" /* address of from area */ \
       
   218       "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
       
   219       "LMG     0,1,0(1)         \n\t" /* load data            */ \
       
   220       "STMG    0,1,0(%[toaddr]) \n\t" /* store data           */ \
       
   221       : [to]     "+Q"  (_to)          /* outputs   */            \
       
   222       , [from]   "+Q"  (_from)        /* outputs   */            \
       
    223       , [toaddr] "=a"  (toaddr)       /* outputs   */            \
       
   224       :                                                          \
       
   225       : "cc",  "r0", "r1"             /* clobbered */            \
       
   226     );                                                           \
       
   227   }
       
   228   #define COPY8_ATOMIC_1(_to,_from) {                            \
       
   229     unsigned long addr;                                          \
       
   230     asm(                                                         \
       
   231       "LG      %[addr],%[from]  \n\t" /* address of from area */ \
       
   232       "LG      0,0(0,%[addr])   \n\t" /* load data            */ \
       
   233       "LG      %[addr],%[to]    \n\t" /* address of to area   */ \
       
   234       "STG     0,0(0,%[addr])   \n\t" /* store data           */ \
       
   235       : [to]     "+Q"  (_to)          /* outputs   */            \
       
   236       , [from]   "+Q"  (_from)        /* outputs   */            \
       
    237       , [addr]   "=a"  (addr)         /* outputs   */            \
       
   238       :                                                          \
       
   239       : "cc",  "r0"                   /* clobbered */            \
       
   240     );                                                           \
       
   241   }
       
   242 
       
   243   //--------------------------------------------------------------
       
   244   // Atomic copying of 4-byte entities.
       
   245   // Exactly 4 (four) entities are copied.
       
   246   // Conjoint/disjoint property does not matter. Entities are first
       
   247   // loaded and then stored.
       
   248   // _to and _from must be 4-byte aligned.
       
   249   //--------------------------------------------------------------
       
   250   #define COPY4_ATOMIC_4(_to,_from) {                            \
       
   251     unsigned long toaddr;                                        \
       
   252     asm(                                                         \
       
   253       "LG      3,%[from]        \n\t" /* address of from area */ \
       
   254       "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
       
   255       "LM      0,3,0(3)         \n\t" /* load data            */ \
       
   256       "STM     0,3,0(%[toaddr]) \n\t" /* store data           */ \
       
   257       : [to]     "+Q"  (_to)          /* outputs   */            \
       
   258       , [from]   "+Q"  (_from)        /* outputs   */            \
       
    259       , [toaddr] "=a"  (toaddr)       /* outputs   */            \
       
   260       :                                                          \
       
   261       : "cc",  "r0", "r1", "r2", "r3" /* clobbered */            \
       
   262     );                                                           \
       
   263   }
       
   264   #define COPY4_ATOMIC_3(_to,_from) {                            \
       
   265     unsigned long toaddr;                                        \
       
   266     asm(                                                         \
       
   267       "LG      2,%[from]        \n\t" /* address of from area */ \
       
   268       "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
       
   269       "LM      0,2,0(2)         \n\t" /* load data            */ \
       
   270       "STM     0,2,0(%[toaddr]) \n\t" /* store data           */ \
       
   271       : [to]     "+Q"  (_to)          /* outputs   */            \
       
   272       , [from]   "+Q"  (_from)        /* outputs   */            \
       
    273       , [toaddr] "=a"  (toaddr)       /* outputs   */            \
       
   274       :                                                          \
       
   275       : "cc",  "r0", "r1", "r2"       /* clobbered */            \
       
   276     );                                                           \
       
   277   }
       
   278   #define COPY4_ATOMIC_2(_to,_from) {                            \
       
   279     unsigned long toaddr;                                        \
       
   280     asm(                                                         \
       
   281       "LG      1,%[from]        \n\t" /* address of from area */ \
       
   282       "LG      %[toaddr],%[to]  \n\t" /* address of to area   */ \
       
   283       "LM      0,1,0(1)         \n\t" /* load data            */ \
       
   284       "STM     0,1,0(%[toaddr]) \n\t" /* store data           */ \
       
   285       : [to]     "+Q"  (_to)          /* outputs   */            \
       
   286       , [from]   "+Q"  (_from)        /* outputs   */            \
       
    287       , [toaddr] "=a"  (toaddr)       /* outputs   */            \
       
   288       :                                                          \
       
   289       : "cc",  "r0", "r1"             /* clobbered */            \
       
   290     );                                                           \
       
   291   }
       
   292   #define COPY4_ATOMIC_1(_to,_from) {                            \
       
   293     unsigned long addr;                                          \
       
   294     asm(                                                         \
       
   295       "LG      %[addr],%[from]  \n\t" /* address of from area */ \
       
   296       "L       0,0(0,%[addr])   \n\t" /* load data            */ \
       
   297       "LG      %[addr],%[to]    \n\t" /* address of to area   */ \
       
   298       "ST      0,0(0,%[addr])   \n\t" /* store data           */ \
       
   299       : [to]     "+Q"  (_to)          /* outputs   */            \
       
   300       , [from]   "+Q"  (_from)        /* outputs   */            \
       
    301       , [addr]   "=a"  (addr)         /* outputs   */            \
       
   302       :                                                          \
       
   303       : "cc",  "r0"                   /* clobbered */            \
       
   304     );                                                           \
       
   305   }
       
   306 
       
   307 #if 0  // Waiting for gcc to support EXRL.
       
   308   #define MVC_MEMCOPY(_to,_from,_len)                                \
       
   309     if (VM_Version::has_ExecuteExtensions()) {                       \
       
   310       asm("\t"                                                       \
       
   311       "    LAY     1,-1(0,%[len])      \n\t" /* decr for MVC  */     \
       
   312       "    EXRL    1,1f                \n\t" /* execute MVC instr */ \
       
   313       "    BRC     15,2f               \n\t" /* skip template */     \
       
   314       "1:  MVC     0(%[len],%[to]),0(%[from]) \n\t"                  \
       
   315       "2:  BCR     0,0                 \n\t"                         \
       
   316       : [to]   "+Q"  (_to)             /* outputs   */               \
       
   317       , [from] "+Q"  (_from)           /* outputs   */               \
       
   318       : [len]  "r"   (_len)            /* inputs    */               \
       
   319       : "cc",  "r1"                    /* clobbered */               \
       
   320       );                                                             \
       
   321     } else {                                                         \
       
   322       asm("\t"                                                       \
       
   323       "    LARL    2,3f                \n\t"                         \
       
   324       "    LAY     1,-1(0,%[len])      \n\t" /* decr for MVC  */     \
       
   325       "    EX      1,0(2)              \n\t" /* execute MVC instr */ \
       
   326       "    BRC     15,4f               \n\t" /* skip template */     \
       
   327       "3:  MVC     0(%[len],%[to]),0(%[from])  \n\t"                 \
       
   328       "4:  BCR     0,0                 \n\t"                         \
       
   329       : [to]   "+Q"  (_to)             /* outputs   */               \
       
   330       , [from] "+Q"  (_from)           /* outputs   */               \
       
   331       : [len]  "r"   (_len)            /* inputs    */               \
       
   332       : "cc",  "r1", "r2"              /* clobbered */               \
       
   333       );                                                             \
       
   334     }
       
   335 #else
       
   336   #define MVC_MEMCOPY(_to,_from,_len)                                \
       
   337   { unsigned long toaddr;   unsigned long tolen;                     \
       
   338     unsigned long fromaddr; unsigned long target;                    \
       
   339       asm("\t"                                                       \
       
   340       "    LTGR    %[tolen],%[len]     \n\t" /* decr for MVC  */     \
       
   341       "    BRC     8,2f                \n\t" /* do nothing for l=0*/ \
       
   342       "    AGHI    %[tolen],-1         \n\t"                         \
       
   343       "    LG      %[toaddr],%[to]     \n\t"                         \
       
   344       "    LG      %[fromaddr],%[from] \n\t"                         \
       
   345       "    LARL    %[target],1f        \n\t" /* addr of MVC instr */ \
       
   346       "    EX      %[tolen],0(%[target])         \n\t" /* execute MVC instr */ \
       
   347       "    BRC     15,2f                         \n\t" /* skip template */     \
       
   348       "1:  MVC     0(1,%[toaddr]),0(%[fromaddr]) \n\t"                         \
       
   349       "2:  BCR     0,0                 \n\t" /* nop a branch target*/\
       
   350       : [to]       "+Q"  (_to)         /* outputs   */               \
       
   351       , [from]     "+Q"  (_from)                                     \
       
   352       , [tolen]    "=a"  (tolen)                                     \
       
   353       , [toaddr]   "=a"  (toaddr)                                    \
       
   354       , [fromaddr] "=a"  (fromaddr)                                  \
       
   355       , [target]   "=a"  (target)                                    \
       
   356       : [len]       "r"  (_len)        /* inputs    */               \
       
   357       : "cc"                           /* clobbered */               \
       
   358       );                                                             \
       
   359   }
       
   360 #endif
       
   361 
       
   362   #if 0  // code snippet to be used for debugging
       
   363       /* ASSERT code BEGIN */                                                \
       
   364       "    LARL    %[len],5f       \n\t"                                     \
       
   365       "    LARL    %[mta],4f       \n\t"                                     \
       
   366       "    SLGR    %[len],%[mta]   \n\t"                                     \
       
   367       "    CGHI    %[len],16       \n\t"                                     \
       
   368       "    BRC     7,9f            \n\t"      /* block size !=  16 */        \
       
   369                                                                              \
       
   370       "    LARL    %[len],1f       \n\t"                                     \
       
   371       "    SLGR    %[len],%[mta]   \n\t"                                     \
       
   372       "    CGHI    %[len],256      \n\t"                                     \
       
   373       "    BRC     7,9f            \n\t"      /* list len   != 256 */        \
       
   374                                                                              \
       
   375       "    LGR     0,0             \n\t"      /* artificial SIGILL */        \
       
   376       "9:  BRC     7,-2            \n\t"                                     \
       
   377       "    LARL    %[mta],1f       \n\t"      /* restore MVC table begin */  \
       
   378       /* ASSERT code END   */
       
   379   #endif
       
   380 
       
   381   // Optimized copying for data less than 4k
       
   382   // - no destructive overlap
       
   383   // - 0 <= _n_bytes <= 4096
       
   384   // This macro needs to be gcc-compiled with -march=z990. Otherwise, the
       
   385   // LAY instruction is not available.
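  // Worked example (illustrative): for _n_bytes = 700, the EX'd MVC template
  // first copies 700 mod 256 = 188 bytes and advances both addresses by 188;
  // then ncl = 700 / 256 = 2, so the computed branch enters the MVC table
  // 2 * 16 bytes before its end and only the last two 256-byte MVC blocks
  // execute, copying the remaining 512 bytes (188 + 512 = 700).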
       
   386   #define MVC_MULTI(_to,_from,_n_bytes)                                      \
       
   387   { unsigned long toaddr;                                                    \
       
   388     unsigned long fromaddr;                                                  \
       
   389     unsigned long movetable;                                                 \
       
   390     unsigned long len;                                                       \
       
   391       asm("\t"                                                               \
       
   392       "    LTGFR   %[len],%[nby]   \n\t"                                     \
       
   393       "    LG      %[ta],%[to]     \n\t"      /* address of to area   */     \
       
   394       "    BRC     8,1f            \n\t"      /* nothing to copy   */        \
       
   395                                                                              \
       
   396       "    NILL    %[nby],255      \n\t"      /* # bytes mod 256      */     \
       
   397       "    LG      %[fa],%[from]   \n\t"      /* address of from area */     \
       
   398       "    BRC     8,3f            \n\t"      /* no rest, skip copying */    \
       
   399                                                                              \
       
   400       "    LARL    %[mta],2f       \n\t"      /* MVC template addr */        \
       
   401       "    AHI     %[nby],-1       \n\t"      /* adjust for EX MVC  */       \
       
   402                                                                              \
       
   403       "    EX      %[nby],0(%[mta]) \n\t"     /* only rightmost */           \
       
   404                                               /* 8 bits of nby used */       \
       
    405       /* Since nby is <= 4096 on entry to this code, we need    */           \
       
   406       /* no zero extension before using it in addr calc.        */           \
       
   407       "    LA      %[fa],1(%[nby],%[fa]) \n\t"/* adjust from addr */         \
       
   408       "    LA      %[ta],1(%[nby],%[ta]) \n\t"/* adjust to   addr */         \
       
   409                                                                              \
       
   410       "3:  SRAG    %[nby],%[len],8 \n\t"      /* # cache lines     */        \
       
   411       "    LARL    %[mta],1f       \n\t"      /* MVC table begin   */        \
       
   412       "    BRC     8,1f            \n\t"      /* nothing to copy   */        \
       
   413                                                                              \
       
   414       /* Insert ASSERT code here if required. */                             \
       
   415                                                                              \
       
   416                                                                              \
       
   417       "    LNGFR   %[nby],%[nby]   \n\t"      /* negative offset into     */ \
       
   418       "    SLLG    %[nby],%[nby],4 \n\t"      /* MVC table 16-byte blocks */ \
       
   419       "    BC      15,0(%[nby],%[mta]) \n\t"  /* branch to block #ncl  */    \
       
   420                                                                              \
       
   421       "2:  MVC     0(1,%[ta]),0(%[fa]) \n\t"  /* MVC template */             \
       
   422                                                                              \
       
   423       "4:  MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 4096 == l        */      \
       
   424       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   425       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
   426       "5:  MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 3840 <= l < 4096 */      \
       
   427       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   428       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
    429       "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 3584 <= l < 3840 */      \
       
   430       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   431       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
    432       "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 3328 <= l < 3584 */      \
       
   433       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   434       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
   435       "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 3072 <= l < 3328 */      \
       
   436       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   437       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
   438       "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 2816 <= l < 3072 */      \
       
   439       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   440       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
   441       "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 2560 <= l < 2816 */      \
       
   442       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   443       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
   444       "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 2304 <= l < 2560 */      \
       
   445       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   446       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
   447       "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 2048 <= l < 2304 */      \
       
   448       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   449       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
   450       "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 1792 <= l < 2048 */      \
       
   451       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   452       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
   453       "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 1536 <= l < 1792 */      \
       
   454       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   455       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
   456       "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 1280 <= l < 1536 */      \
       
   457       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   458       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
   459       "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /* 1024 <= l < 1280 */      \
       
   460       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   461       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
   462       "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /*  768 <= l < 1024 */      \
       
   463       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   464       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
   465       "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /*  512 <= l <  768 */      \
       
   466       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   467       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
   468       "    MVC     0(256,%[ta]),0(%[fa])   \n\t" /*  256 <= l <  512 */      \
       
   469       "    LAY     %[ta],256(0,%[ta])      \n\t"                             \
       
   470       "    LA      %[fa],256(0,%[fa])      \n\t"                             \
       
   471       "1:  BCR     0,0                     \n\t" /* nop as branch target */  \
       
   472       : [to]       "+Q"  (_to)          /* outputs   */          \
       
   473       , [from]     "+Q"  (_from)                                 \
       
   474       , [ta]       "=a"  (toaddr)                                \
       
   475       , [fa]       "=a"  (fromaddr)                              \
       
   476       , [mta]      "=a"  (movetable)                             \
       
   477       , [nby]      "+a"  (_n_bytes)                              \
       
   478       , [len]      "=a"  (len)                                   \
       
   479       :                                                          \
       
   480       : "cc"                            /* clobbered */          \
       
   481     );                                                           \
       
   482   }
       
   483 
       
   484   #define MVCLE_MEMCOPY(_to,_from,_len)                           \
       
   485     asm(                                                          \
       
   486       "    LG      0,%[to]     \n\t"   /* address of to area   */ \
       
   487       "    LG      2,%[from]   \n\t"   /* address of from area */ \
       
   488       "    LGR     1,%[len]    \n\t"   /* len of to area       */ \
       
   489       "    LGR     3,%[len]    \n\t"   /* len of from area     */ \
       
   490       "1:  MVCLE   0,2,176     \n\t"   /* copy storage, bypass cache (0xb0) */ \
       
   491       "    BRC     1,1b        \n\t"   /* retry if interrupted */ \
       
   492       : [to]   "+Q"  (_to)             /* outputs   */            \
       
   493       , [from] "+Q"  (_from)           /* outputs   */            \
       
   494       : [len]  "r"   (_len)            /* inputs    */            \
       
   495       : "cc",  "r0", "r1", "r2", "r3"  /* clobbered */            \
       
   496     );
       
   497 
       
   498   #define MVCLE_MEMINIT(_to,_val,_len)                            \
       
   499     asm(                                                          \
       
   500       "    LG      0,%[to]       \n\t" /* address of to area   */ \
       
   501       "    LGR     1,%[len]      \n\t" /* len of to area       */ \
       
   502       "    XGR     3,3           \n\t" /* from area len = 0    */ \
       
   503       "1:  MVCLE   0,2,0(%[val]) \n\t" /* init storage         */ \
       
   504       "    BRC     1,1b          \n\t" /* retry if interrupted */ \
       
   505       : [to]   "+Q"  (_to)             /* outputs   */            \
       
   506       : [len]  "r"   (_len)            /* inputs    */            \
       
   507       , [val]  "r"   (_val)            /* inputs    */            \
       
   508       : "cc",  "r0", "r1", "r3"        /* clobbered */            \
       
   509     );
       
   510   #define MVCLE_MEMZERO(_to,_len)                                 \
       
   511     asm(                                                          \
       
   512       "    LG      0,%[to]       \n\t" /* address of to area   */ \
       
   513       "    LGR     1,%[len]      \n\t" /* len of to area       */ \
       
   514       "    XGR     3,3           \n\t" /* from area len = 0    */ \
       
   515       "1:  MVCLE   0,2,0         \n\t" /* clear storage        */ \
       
   516       "    BRC     1,1b          \n\t" /* retry if interrupted */ \
       
   517       : [to]   "+Q"  (_to)             /* outputs   */            \
       
   518       : [len]  "r"   (_len)            /* inputs    */            \
       
   519       : "cc",  "r0", "r1", "r3"        /* clobbered */            \
       
   520     );
       
   521 
       
   522   // Clear a stretch of memory, 0 <= _len <= 256.
       
   523   // There is no alignment prereq.
       
   524   // There is no test for len out of range specified above.
       
   525   #define XC_MEMZERO_256(_to,_len)                                 \
       
   526 { unsigned long toaddr;   unsigned long tolen;                     \
       
   527   unsigned long target;                                            \
       
   528     asm("\t"                                                       \
       
    529     "    LTGR    %[tolen],%[len]     \n\t" /* decr for XC   */     \
       
   530     "    BRC     8,2f                \n\t" /* do nothing for l=0*/ \
       
   531     "    AGHI    %[tolen],-1         \n\t" /* adjust for EX XC  */ \
       
   532     "    LARL    %[target],1f        \n\t" /* addr of XC instr  */ \
       
   533     "    LG      %[toaddr],%[to]     \n\t" /* addr of data area */ \
       
    534     "    EX      %[tolen],0(%[target])       \n\t" /* execute XC instr  */ \
       
   535     "    BRC     15,2f                       \n\t" /* skip template */     \
       
   536     "1:  XC      0(1,%[toaddr]),0(%[toaddr]) \n\t"                         \
       
   537     "2:  BCR     0,0                 \n\t" /* nop a branch target*/\
       
   538     : [to]       "+Q"  (_to)         /* outputs   */               \
       
   539     , [tolen]    "=a"  (tolen)                                     \
       
   540     , [toaddr]   "=a"  (toaddr)                                    \
       
   541     , [target]   "=a"  (target)                                    \
       
   542     : [len]       "r"  (_len)        /* inputs    */               \
       
   543     : "cc"                           /* clobbered */               \
       
   544     );                                                             \
       
   545 }
       
   546 
       
   547   // Clear a stretch of memory, 256 < _len.
       
   548   // XC_MEMZERO_256 may be used to clear shorter areas.
       
   549   //
       
   550   // The code
       
   551   // - first zeroes a few bytes to align on a HeapWord.
       
   552   //   This step is currently inactive because all calls seem
       
   553   //   to have their data aligned on HeapWord boundaries.
       
   554   // - then zeroes a few HeapWords to align on a cache line.
       
   555   // - then zeroes entire cache lines in a loop.
       
   556   // - then zeroes the remaining (partial) cache line.
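  // Worked example (illustrative): for a target address ending in 0x40 and
  // _len = 700, the code first clears 256 - 0x40 = 192 bytes to reach the
  // next cache line boundary, then clears one full 256-byte cache line in
  // the loop, and finally clears the remaining 700 - 192 - 256 = 252 bytes
  // via the EX'd XC template.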
       
   557 #if 1
       
   558   #define XC_MEMZERO_ANY(_to,_len)                                    \
       
   559 { unsigned long toaddr;   unsigned long tolen;                        \
       
   560   unsigned long len8;     unsigned long len256;                       \
       
   561   unsigned long target;   unsigned long lenx;                         \
       
   562     asm("\t"                                                          \
       
   563     "    LTGR    %[tolen],%[len]      \n\t" /*                   */   \
       
   564     "    BRC     8,2f                 \n\t" /* do nothing for l=0*/   \
       
   565     "    LG      %[toaddr],%[to]      \n\t" /* addr of data area */   \
       
   566     "    LARL    %[target],1f         \n\t" /* addr of XC instr  */   \
       
   567     " "                                                               \
       
   568     "    LCGR    %[len256],%[toaddr]  \n\t" /* cache line alignment */\
       
   569     "    NILL    %[len256],0xff       \n\t"                           \
       
   570     "    BRC     8,4f                 \n\t" /* already aligned     */ \
       
   571     "    NILH    %[len256],0x00       \n\t" /* zero extend         */ \
       
   572     "    LLGFR   %[len256],%[len256]  \n\t"                           \
       
   573     "    LAY     %[lenx],-1(,%[len256]) \n\t"                         \
       
    574     "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr    */ \
       
   575     "    LA      %[toaddr],0(%[len256],%[toaddr]) \n\t"               \
       
   576     "    SGR     %[tolen],%[len256]   \n\t" /* adjust len          */ \
       
   577     " "                                                               \
       
   578     "4:  SRAG    %[lenx],%[tolen],8   \n\t" /* # cache lines       */ \
       
   579     "    BRC     8,6f                 \n\t" /* no full cache lines */ \
       
   580     "5:  XC      0(256,%[toaddr]),0(%[toaddr]) \n\t"                  \
       
   581     "    LA      %[toaddr],256(,%[toaddr]) \n\t"                      \
       
   582     "    BRCTG   %[lenx],5b           \n\t" /* iterate             */ \
       
   583     " "                                                               \
       
   584     "6:  NILL    %[tolen],0xff        \n\t" /* leftover bytes      */ \
       
   585     "    BRC     8,2f                 \n\t" /* done if none        */ \
       
   586     "    LAY     %[lenx],-1(,%[tolen]) \n\t"                          \
       
    587     "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr    */ \
       
   588     "    BRC     15,2f                \n\t" /* skip template       */ \
       
   589     " "                                                               \
       
   590     "1:  XC      0(1,%[toaddr]),0(%[toaddr]) \n\t"                    \
       
   591     "2:  BCR     0,0                  \n\t" /* nop a branch target */ \
       
   592     : [to]       "+Q"  (_to)         /* outputs   */               \
       
   593     , [lenx]     "=a"  (lenx)                                      \
       
   594     , [len256]   "=a"  (len256)                                    \
       
   595     , [tolen]    "=a"  (tolen)                                     \
       
   596     , [toaddr]   "=a"  (toaddr)                                    \
       
   597     , [target]   "=a"  (target)                                    \
       
   598     : [len]       "r"  (_len)        /* inputs    */               \
       
   599     : "cc"                           /* clobbered */               \
       
   600     );                                                             \
       
   601 }
       
   602 #else
       
   603   #define XC_MEMZERO_ANY(_to,_len)                                    \
       
   604 { unsigned long toaddr;   unsigned long tolen;                        \
       
   605   unsigned long len8;     unsigned long len256;                       \
       
   606   unsigned long target;   unsigned long lenx;                         \
       
   607     asm("\t"                                                          \
       
   608     "    LTGR    %[tolen],%[len]      \n\t" /*                   */   \
       
   609     "    BRC     8,2f                 \n\t" /* do nothing for l=0*/   \
       
   610     "    LG      %[toaddr],%[to]      \n\t" /* addr of data area */   \
       
   611     "    LARL    %[target],1f         \n\t" /* addr of XC instr  */   \
       
   612     " "                                                               \
       
   613     "    LCGR    %[len8],%[toaddr]    \n\t" /* HeapWord alignment  */ \
       
   614     "    NILL    %[len8],0x07         \n\t"                           \
       
   615     "    BRC     8,3f                 \n\t" /* already aligned     */ \
       
   616     "    NILH    %[len8],0x00         \n\t" /* zero extend         */ \
       
   617     "    LLGFR   %[len8],%[len8]      \n\t"                           \
       
   618     "    LAY     %[lenx],-1(,%[len8]) \n\t"                           \
       
    619     "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr  */   \
       
   620     "    LA      %[toaddr],0(%[len8],%[toaddr]) \n\t"                 \
       
   621     "    SGR     %[tolen],%[len8]     \n\t" /* adjust len          */ \
       
   622     " "                                                               \
       
   623     "3:  LCGR    %[len256],%[toaddr]  \n\t" /* cache line alignment */\
       
   624     "    NILL    %[len256],0xff       \n\t"                           \
       
   625     "    BRC     8,4f                 \n\t" /* already aligned     */ \
       
   626     "    NILH    %[len256],0x00       \n\t" /* zero extend         */ \
       
   627     "    LLGFR   %[len256],%[len256]  \n\t"                           \
       
   628     "    LAY     %[lenx],-1(,%[len256]) \n\t"                         \
       
    629     "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr    */ \
       
   630     "    LA      %[toaddr],0(%[len256],%[toaddr]) \n\t"               \
       
   631     "    SGR     %[tolen],%[len256]   \n\t" /* adjust len          */ \
       
   632     " "                                                               \
       
   633     "4:  SRAG    %[lenx],%[tolen],8   \n\t" /* # cache lines       */ \
       
   634     "    BRC     8,6f                 \n\t" /* no full cache lines */ \
       
   635     "5:  XC      0(256,%[toaddr]),0(%[toaddr]) \n\t"                  \
       
   636     "    LA      %[toaddr],256(,%[toaddr]) \n\t"                      \
       
   637     "    BRCTG   %[lenx],5b           \n\t" /* iterate             */ \
       
   638     " "                                                               \
       
   639     "6:  NILL    %[tolen],0xff        \n\t" /* leftover bytes      */ \
       
   640     "    BRC     8,2f                 \n\t" /* done if none        */ \
       
   641     "    LAY     %[lenx],-1(,%[tolen]) \n\t"                          \
       
    642     "    EX      %[lenx],0(%[target]) \n\t" /* execute XC instr    */ \
       
   643     "    BRC     15,2f                \n\t" /* skip template       */ \
       
   644     " "                                                               \
       
   645     "1:  XC      0(1,%[toaddr]),0(%[toaddr]) \n\t"                    \
       
   646     "2:  BCR     0,0                  \n\t" /* nop a branch target */ \
       
   647     : [to]       "+Q"  (_to)         /* outputs   */               \
       
   648     , [lenx]     "=a"  (lenx)                                      \
       
   649     , [len8]     "=a"  (len8)                                      \
       
   650     , [len256]   "=a"  (len256)                                    \
       
   651     , [tolen]    "=a"  (tolen)                                     \
       
   652     , [toaddr]   "=a"  (toaddr)                                    \
       
   653     , [target]   "=a"  (target)                                    \
       
   654     : [len]       "r"  (_len)        /* inputs    */               \
       
   655     : "cc"                           /* clobbered */               \
       
   656     );                                                             \
       
   657 }
       
   658 #endif
       
   659 #endif // USE_INLINE_ASM
       
   660 
       
   661 //*************************************//
       
   662 //   D I S J O I N T   C O P Y I N G   //
       
   663 //*************************************//
       
   664 
       
   665 static void pd_aligned_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
       
   666   // JVM2008: very frequent, some tests frequent.
       
   667 
       
   668   // Copy HeapWord (=DW) aligned storage. Use MVCLE in inline-asm code.
       
   669   // MVCLE guarantees DW concurrent (i.e. atomic) accesses if both the addresses of the operands
       
   670   // are DW aligned and the length is an integer multiple of a DW. Should always be true here.
       
   671   //
       
   672   // No special exploit needed. H/W discovers suitable situations itself.
       
   673   //
       
   674   // For large chunks of memory, exploit special H/W support of z/Architecture:
       
   675   // 1) copy short piece of memory to page-align address(es)
       
   676   // 2) copy largest part (all contained full pages) of memory using mvcle instruction.
       
   677   //    z/Architecture processors have special H/W support for page-aligned storage
       
   678   //    where len is an int multiple of page size. In that case, up to 4 cache lines are
       
   679   //    processed in parallel and L1 cache is not polluted.
       
   680   // 3) copy the remaining piece of memory.
       
   681   //
       
   682 #ifdef USE_INLINE_ASM
       
   683   jbyte* to_bytes   = (jbyte*)to;
       
   684   jbyte* from_bytes = (jbyte*)from;
       
   685   size_t len_bytes  = count*HeapWordSize;
       
   686 
       
   687   // Optimized copying for data less than 4k
       
   688   switch (count) {
       
   689     case 0: return;
       
   690     case 1: MOVE8_ATOMIC_1(to,from)
       
   691             return;
       
   692     case 2: MOVE8_ATOMIC_2(to,from)
       
   693             return;
       
   694 //  case 3: MOVE8_ATOMIC_3(to,from)
       
   695 //          return;
       
   696 //  case 4: MOVE8_ATOMIC_4(to,from)
       
   697 //          return;
       
   698     default:
       
   699       if (len_bytes <= 4096) {
       
   700         MVC_MULTI(to,from,len_bytes)
       
   701         return;
       
   702       }
       
   703       // else
       
   704       MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
       
   705       return;
       
   706   }
       
   707 #else
       
   708   // Fallback code.
       
   709   switch (count) {
       
   710     case 0:
       
   711       return;
       
   712 
       
   713     case 1:
       
   714       *to = *from;
       
   715       return;
       
   716 
       
   717     case 2:
       
   718       *to++ = *from++;
       
   719       *to = *from;
       
   720       return;
       
   721 
       
   722     case 3:
       
   723       *to++ = *from++;
       
   724       *to++ = *from++;
       
   725       *to = *from;
       
   726       return;
       
   727 
       
   728     case 4:
       
   729       *to++ = *from++;
       
   730       *to++ = *from++;
       
   731       *to++ = *from++;
       
   732       *to = *from;
       
   733       return;
       
   734 
       
   735     default:
       
   736       while (count-- > 0)
       
   737         *(to++) = *(from++);
       
   738       return;
       
   739   }
       
   740 #endif
       
   741 }
       
   742 
       
   743 static void pd_disjoint_words_atomic(HeapWord* from, HeapWord* to, size_t count) {
       
   744   // JVM2008: < 4k calls.
       
   745   assert(((((size_t)from) & 0x07L) | (((size_t)to) & 0x07L)) == 0, "No atomic copy w/o aligned data");
       
   746   pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
       
   747 }
       
   748 
       
   749 static void pd_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
       
   750   // JVM2008: very rare.
       
   751   pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
       
   752 }
       
   753 
       
   754 
       
   755 //*************************************//
       
   756 //   C O N J O I N T   C O P Y I N G   //
       
   757 //*************************************//
       
   758 
       
   759 static void pd_aligned_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
       
   760   // JVM2008: between some and lower end of frequent.
       
   761 
       
   762 #ifdef USE_INLINE_ASM
       
   763   size_t  count_in = count;
       
   764   if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
       
   765     switch (count_in) {
       
   766       case 4: COPY8_ATOMIC_4(to,from)
       
   767               return;
       
   768       case 3: COPY8_ATOMIC_3(to,from)
       
   769               return;
       
   770       case 2: COPY8_ATOMIC_2(to,from)
       
   771               return;
       
   772       case 1: COPY8_ATOMIC_1(to,from)
       
   773               return;
       
   774       case 0: return;
       
   775       default:
       
   776         from += count_in;
       
   777         to   += count_in;
       
   778         while (count_in-- > 0)
       
   779           *(--to) = *(--from); // Copy backwards, areas overlap destructively.
       
   780         return;
       
   781     }
       
   782   }
       
   783   // else
       
   784   jbyte* to_bytes   = (jbyte*)to;
       
   785   jbyte* from_bytes = (jbyte*)from;
       
   786   size_t len_bytes  = count_in*BytesPerLong;
       
   787   MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
       
   788   return;
       
   789 #else
       
   790   // Fallback code.
       
   791   if (has_destructive_overlap((char*)from, (char*)to, count*BytesPerLong)) {
       
   792     HeapWord t1, t2, t3;
       
   793     switch (count) {
       
   794       case 0:
       
   795         return;
       
   796 
       
   797       case 1:
       
   798         *to = *from;
       
   799         return;
       
   800 
       
   801       case 2:
       
   802         t1 = *(from+1);
       
   803         *to = *from;
       
   804         *(to+1) = t1;
       
   805         return;
       
   806 
       
   807       case 3:
       
   808         t1 = *(from+1);
       
   809         t2 = *(from+2);
       
   810         *to = *from;
       
   811         *(to+1) = t1;
       
   812         *(to+2) = t2;
       
   813         return;
       
   814 
       
   815       case 4:
       
   816         t1 = *(from+1);
       
   817         t2 = *(from+2);
       
   818         t3 = *(from+3);
       
   819         *to = *from;
       
   820         *(to+1) = t1;
       
   821         *(to+2) = t2;
       
   822         *(to+3) = t3;
       
   823         return;
       
   824 
       
   825       default:
       
   826         from += count;
       
   827         to   += count;
       
   828         while (count-- > 0)
       
   829           *(--to) = *(--from); // Copy backwards, areas overlap destructively.
       
   830         return;
       
   831     }
       
   832   }
       
   833   // else
       
   834   // Just delegate. HeapWords are optimally aligned anyway.
       
   835   pd_aligned_disjoint_words(from, to, count);
       
   836 #endif
       
   837 }
       
   838 
       
   839 static void pd_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
       
   840 
       
   841   // Just delegate. HeapWords are optimally aligned anyway.
       
   842   pd_aligned_conjoint_words(from, to, count);
       
   843 }
       
   844 
       
   845 static void pd_conjoint_bytes(void* from, void* to, size_t count) {
       
   846 
       
   847 #ifdef USE_INLINE_ASM
       
   848   size_t count_in = count;
       
   849   if (has_destructive_overlap((char*)from, (char*)to, count_in))
       
   850     (void)memmove(to, from, count_in);
       
   851   else {
       
   852     jbyte*  to_bytes   = (jbyte*)to;
       
   853     jbyte*  from_bytes = (jbyte*)from;
       
   854     size_t  len_bytes  = count_in;
       
   855     MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
       
   856   }
       
   857 #else
       
   858   if (has_destructive_overlap((char*)from, (char*)to, count))
       
   859     (void)memmove(to, from, count);
       
   860   else
       
   861     (void)memcpy(to, from, count);
       
   862 #endif
       
   863 }
       
   864 
       
   865 //**************************************************//
       
   866 //   C O N J O I N T  A T O M I C   C O P Y I N G   //
       
   867 //**************************************************//
       
   868 
       
   869 static void pd_conjoint_bytes_atomic(void* from, void* to, size_t count) {
       
    870   // Just delegate to pd_conjoint_bytes; no separate stub is needed here.
       
   871   pd_conjoint_bytes(from, to, count); // bytes are always accessed atomically.
       
   872 }
       
   873 
       
   874 static void pd_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {
       
   875 
       
   876 #ifdef USE_INLINE_ASM
       
   877   size_t count_in = count;
       
   878   if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerShort)) {
       
   879     // Use optimizations from shared code where no z-specific optimization exists.
       
   880     copy_conjoint_jshorts_atomic(from, to, count);
       
   881   } else {
       
   882     jbyte* to_bytes   = (jbyte*)to;
       
   883     jbyte* from_bytes = (jbyte*)from;
       
   884     size_t len_bytes  = count_in*BytesPerShort;
       
   885     MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
       
   886   }
       
   887 #else
       
   888   // Use optimizations from shared code where no z-specific optimization exists.
       
   889   copy_conjoint_jshorts_atomic(from, to, count);
       
   890 #endif
       
   891 }
       
   892 
       
   893 static void pd_conjoint_jints_atomic(jint* from, jint* to, size_t count) {
       
   894 
       
   895 #ifdef USE_INLINE_ASM
       
   896   size_t count_in = count;
       
   897   if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerInt)) {
       
   898     switch (count_in) {
       
   899       case 4: COPY4_ATOMIC_4(to,from)
       
   900               return;
       
   901       case 3: COPY4_ATOMIC_3(to,from)
       
   902               return;
       
   903       case 2: COPY4_ATOMIC_2(to,from)
       
   904               return;
       
   905       case 1: COPY4_ATOMIC_1(to,from)
       
   906               return;
       
   907       case 0: return;
       
   908       default:
       
   909         // Use optimizations from shared code where no z-specific optimization exists.
       
   910         copy_conjoint_jints_atomic(from, to, count_in);
       
   911         return;
       
   912     }
       
   913   }
       
   914   // else
       
   915   jbyte* to_bytes   = (jbyte*)to;
       
   916   jbyte* from_bytes = (jbyte*)from;
       
   917   size_t len_bytes  = count_in*BytesPerInt;
       
   918   MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
       
   919 #else
       
   920   // Use optimizations from shared code where no z-specific optimization exists.
       
   921   copy_conjoint_jints_atomic(from, to, count);
       
   922 #endif
       
   923 }
       
   924 
       
   925 static void pd_conjoint_jlongs_atomic(jlong* from, jlong* to, size_t count) {
       
   926 
       
   927 #ifdef USE_INLINE_ASM
       
   928   size_t count_in = count;
       
   929   if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
       
   930     switch (count_in) {
       
   931       case 4: COPY8_ATOMIC_4(to,from) return;
       
   932       case 3: COPY8_ATOMIC_3(to,from) return;
       
   933       case 2: COPY8_ATOMIC_2(to,from) return;
       
   934       case 1: COPY8_ATOMIC_1(to,from) return;
       
   935       case 0: return;
       
   936       default:
       
   937         from += count_in;
       
   938         to   += count_in;
       
   939         while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
       
   940         return;
       
   941     }
       
   942   }
       
    943   // else
       
   944   jbyte* to_bytes   = (jbyte*)to;
       
   945   jbyte* from_bytes = (jbyte*)from;
       
   946   size_t len_bytes  = count_in*BytesPerLong;
       
   947   MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
       
   948 #else
       
   949   size_t count_in = count;
       
   950   if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
       
   951     if (count_in < 8) {
       
   952       from += count_in;
       
   953       to   += count_in;
       
   954       while (count_in-- > 0)
       
   955          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
       
   956       return;
       
   957     }
       
    958     // else
       
   959     from += count_in-1;
       
   960     to   += count_in-1;
       
   961     if (count_in&0x01) {
       
   962       *(to--) = *(from--);
       
   963       count_in--;
       
   964     }
       
   965     for (; count_in>0; count_in-=2) {
       
   966       *to     = *from;
       
   967       *(to-1) = *(from-1);
       
   968       to     -= 2;
       
   969       from   -= 2;
       
   970     }
       
   971   }
       
   972   else
       
   973     pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
       
   974 #endif
       
   975 }
       
   976 
       
   977 static void pd_conjoint_oops_atomic(oop* from, oop* to, size_t count) {
       
   978 
       
   979 #ifdef USE_INLINE_ASM
       
   980   size_t count_in = count;
       
   981   if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
       
   982     switch (count_in) {
       
   983       case 4: COPY8_ATOMIC_4(to,from) return;
       
   984       case 3: COPY8_ATOMIC_3(to,from) return;
       
   985       case 2: COPY8_ATOMIC_2(to,from) return;
       
   986       case 1: COPY8_ATOMIC_1(to,from) return;
       
   987       case 0: return;
       
   988       default:
       
   989         from += count_in;
       
   990         to   += count_in;
       
   991         while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
       
   992         return;
       
   993     }
       
   994   }
       
   995   // else
       
   996   jbyte* to_bytes   = (jbyte*)to;
       
   997   jbyte* from_bytes = (jbyte*)from;
       
   998   size_t len_bytes  = count_in*BytesPerOop;
       
   999   MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
       
  1000 #else
       
  1001   size_t count_in = count;
       
  1002   if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
       
  1003     from += count_in;
       
  1004     to   += count_in;
       
  1005     while (count_in-- > 0) *(--to) = *(--from); // Copy backwards, areas overlap destructively.
       
  1006     return;
       
  1007   }
       
  1008   // else
       
  1009   pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
       
  1010   return;
       
  1011 #endif
       
  1012 }
       
  1013 
       
  1014 static void pd_arrayof_conjoint_bytes(HeapWord* from, HeapWord* to, size_t count) {
       
  1015   pd_conjoint_bytes_atomic(from, to, count);
       
  1016 }
       
  1017 
       
  1018 static void pd_arrayof_conjoint_jshorts(HeapWord* from, HeapWord* to, size_t count) {
       
  1019   pd_conjoint_jshorts_atomic((jshort*)from, (jshort*)to, count);
       
  1020 }
       
  1021 
       
  1022 static void pd_arrayof_conjoint_jints(HeapWord* from, HeapWord* to, size_t count) {
       
  1023   pd_conjoint_jints_atomic((jint*)from, (jint*)to, count);
       
  1024 }
       
  1025 
       
  1026 static void pd_arrayof_conjoint_jlongs(HeapWord* from, HeapWord* to, size_t count) {
       
  1027   pd_conjoint_jlongs_atomic((jlong*)from, (jlong*)to, count);
       
  1028 }
       
  1029 
       
  1030 static void pd_arrayof_conjoint_oops(HeapWord* from, HeapWord* to, size_t count) {
       
  1031   pd_conjoint_oops_atomic((oop*)from, (oop*)to, count);
       
  1032 }
       
  1033 
       
  1034 //**********************************************//
       
  1035 //  M E M O R Y   I N I T I A L I S A T I O N   //
       
  1036 //**********************************************//
       
  1037 
       
  1038 static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {
       
  1039   // JVM2008: very rare, only in some tests.
       
  1040 #ifdef USE_INLINE_ASM
       
   1041   // Initialize storage to a given value, block-wise (MVCLE/memset) rather than with a copy loop.
       
  1042   // For large chunks of memory, exploit special H/W support of z/Architecture:
       
   1043   // 1) init a short piece of memory to page-align the start address
       
   1044   // 2) init the largest part of memory (all fully contained pages) using the MVCLE instruction.
       
  1045   //    z/Architecture processors have special H/W support for page-aligned storage
       
   1046   //    where the length is an integer multiple of the page size. In that case, up to 4 cache lines are
       
  1047   //    processed in parallel and L1 cache is not polluted.
       
  1048   // 3) init the remaining piece of memory.
       
  1049   // Atomicity cannot really be an issue since gcc implements the loop body with XC anyway.
       
  1050   // If atomicity is a problem, we have to prevent gcc optimization. Best workaround: inline asm.
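          //
          // A minimal sketch of the 1)-3) split described above (page_size is a
          // placeholder, not a symbol defined in this file; the code below simply
          // hands the entire range to MVCLE_MEMINIT):
          //   size_t head = (-(uintptr_t)to) & (page_size - 1);  // bytes up to the next page boundary
          //   size_t body = (count - head) & ~(page_size - 1);   // fully contained pages (assumes count > head)
          //   size_t tail = count - head - body;                 // remaining bytes after the last full page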
       
  1051 
       
  1052   jbyte*  to_bytes  = (jbyte*)to;
       
  1053   size_t  len_bytes = count;
       
  1054 
       
  1055   MVCLE_MEMINIT(to_bytes, value, len_bytes)
       
  1056 
       
  1057 #else
       
  1058   // Memset does the best job possible: loop over 256-byte MVCs, with
       
  1059   // the last MVC EXecuted. With the -mmvcle option, initialization
       
  1060   // is done using MVCLE -> slight advantage for large areas.
       
  1061   (void)memset(to, value, count);
       
  1062 #endif
       
  1063 }
       
  1064 
       
  1065 static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) {
       
  1066   // Occurs in dbg builds only. Usually memory poisoning with BAADBABE, DEADBEEF, etc.
       
  1067   // JVM2008: < 4k calls.
       
  1068   if (value == 0) {
       
  1069     pd_zero_to_words(tohw, count);
       
  1070     return;
       
  1071   }
       
  1072   if (value == ~(juint)(0)) {
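            // A byte fill with 0xFF yields the same all-ones pattern in every HeapWord.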
       
  1073     pd_fill_to_bytes(tohw, count*HeapWordSize, (jubyte)(~(juint)(0)));
       
  1074     return;
       
  1075   }
       
  1076   julong* to = (julong*) tohw;
       
  1077   julong  v  = ((julong) value << 32) | value;
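          // e.g. value == 0xDEADBEEF replicates to v == 0xDEADBEEFDEADBEEF.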
       
  1078   while (count-- > 0) {
       
  1079     *to++ = v;
       
  1080   }
       
  1081 }
       
  1082 
       
  1083 static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) {
       
  1084   // JVM2008: very frequent, but virtually all calls are with value == 0.
       
  1085   pd_fill_to_words(tohw, count, value);
       
  1086 }
       
  1087 
       
  1088 //**********************************//
       
  1089 //  M E M O R Y   C L E A R I N G   //
       
  1090 //**********************************//
       
  1091 
       
   1092 // Delegate to pd_zero_to_bytes. It is HeapWord-atomic as well.
       
  1093 // Distinguish between simple and large zero_to_words.
       
  1094 static void pd_zero_to_words(HeapWord* tohw, size_t count) {
       
  1095   pd_zero_to_bytes(tohw, count*HeapWordSize);
       
  1096 }
       
  1097 
       
   1098 // Delegate to pd_zero_to_bytes. It is HeapWord-atomic as well.
       
  1099 static void pd_zero_to_words_large(HeapWord* tohw, size_t count) {
       
  1100   // JVM2008: generally frequent, some tests show very frequent calls.
       
  1101   pd_zero_to_bytes(tohw, count*HeapWordSize);
       
  1102 }
       
  1103 
       
  1104 static void pd_zero_to_bytes(void* to, size_t count) {
       
   1105   // JVM2008: some calls in general; frequent in some tests.
       
  1106 #ifdef USE_INLINE_ASM
       
   1107   // Even zero_to_bytes() requires HeapWord-atomic, or at least sequential,
       
  1108   // zeroing of the memory. MVCLE is not fit for that job:
       
  1109   //   "As observed by other CPUs and by the channel subsystem,
       
  1110   //    that portion of the first operand which is filled
       
  1111   //    with the padding byte is not necessarily stored into in
       
  1112   //    a left-to-right direction and may appear to be stored
       
  1113   //    into more than once."
       
   1114   // Therefore, the implementation was changed to use (multiple) XC instructions.
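          // XC processes its operands left to right, one byte at a time, and clears
          // at most 256 bytes per instruction (8-bit length field). Hence the
          // 256-byte chunking below; multiple XCs keep the stores sequential.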
       
  1115 
       
  1116   const long line_size = 256;
       
  1117   jbyte* to_bytes  = (jbyte*)to;
       
  1118   size_t len_bytes = count;
       
  1119 
       
  1120   if (len_bytes <= line_size) {
       
  1121     XC_MEMZERO_256(to_bytes, len_bytes);
       
  1122   } else {
       
  1123     XC_MEMZERO_ANY(to_bytes, len_bytes);
       
  1124   }
       
  1125 
       
  1126 #else
       
  1127   // Memset does the best job possible: loop over 256-byte MVCs, with
       
  1128   // the last MVC EXecuted. With the -mmvcle option, initialization
       
  1129   // is done using MVCLE -> slight advantage for large areas.
       
  1130   (void)memset(to, 0, count);
       
  1131 #endif
       
  1132 }
       
  1133 
       
  1134 #endif // CPU_S390_VM_COPY_S390_HPP