/*
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// Major contributions by LS

#ifndef CPU_S390_VM_COPY_S390_HPP
#define CPU_S390_VM_COPY_S390_HPP

// Inline functions for memory copy and fill.

// HeapWordSize (the size of class HeapWord) is 8 Bytes (the size of a
// pointer variable), since we always run the _LP64 model. As a consequence,
// HeapWord* memory ranges are always assumed to be doubleword-aligned,
// having a size which is an integer multiple of HeapWordSize.
//
// Dealing only with doubleword-aligned doubleword units has important
// positive performance and data access consequences. Many of the move
// instructions perform particularly well under these circumstances.
// Data access is "doubleword-concurrent", except for MVC and XC.
// Furthermore, data access can be forced to be sequential (MVCL and MVCLE)
// by use of the special padding byte 0xb1, where required. For copying,
// we use padding byte 0xb0 to prevent the D-cache from being polluted.
//
// On z/Architecture, gcc optimizes memcpy into a series of MVC instructions.
// This is optimal, even if just one HeapWord is copied. However, MVC
// copying is not atomic, i.e. not "doubleword concurrent" by definition.
//
// If the -mmvcle compiler option is specified, memcpy translates into
// code such that the entire memory range is copied or preset with just
// one MVCLE instruction.
//
// *to = *from is transformed into a MVC instruction already with -O1.
// Thus, for atomic copy operations, (inline) assembler code is required
// to guarantee atomic data accesses.
//
// For large (len >= MVCLEThreshold) chunks of memory, we exploit
// special H/W support of z/Architecture:
// 1) copy short piece of memory to page-align address(es)
// 2) copy largest part (all contained full pages) of memory using mvcle instruction.
//    z/Architecture processors have special H/W support for page-aligned storage
//    where len is an int multiple of page size. In that case, up to 4 cache lines are
//    processed in parallel and L1 cache is not polluted.
// 3) copy the remaining piece of memory.
//
// Measurement classifications:
// very rare - <=     10.000 calls AND <=     1.000 usec elapsed
// rare      - <=    100.000 calls AND <=    10.000 usec elapsed
// some      - <=  1.000.000 calls AND <=   100.000 usec elapsed
// freq      - <= 10.000.000 calls AND <= 1.000.000 usec elapsed
// very freq -  > 10.000.000 calls OR   > 1.000.000 usec elapsed

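// For illustration only: the three-step strategy above, sketched in plain
// C++. This is a hedged sketch, not part of the implementation (the real
// work is done by the inline-asm macros below); the function name and the
// 4k page size are assumptions.
#if 0
static void large_copy_sketch(char* from, char* to, size_t len) {
  const size_t page = 4096;                                        // assumed page size
  // 1) Copy a short head so that 'to' becomes page-aligned.
  size_t head = (page - ((size_t)to & (page - 1))) & (page - 1);
  if (head > len) { head = len; }
  memcpy(to, from, head);
  // 2) Bulk-copy all fully contained pages (MVCLE in the real code).
  size_t bulk = ((len - head) / page) * page;
  memcpy(to + head, from + head, bulk);
  // 3) Copy the remaining tail.
  memcpy(to + head + bulk, from + head + bulk, len - head - bulk);
}
#endif
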
#undef USE_INLINE_ASM

static void copy_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      // Copy backwards
      *to-- = *from--;
    }
  }
}

static void copy_conjoint_jints_atomic(jint* from, jint* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      // Copy backwards
      *to-- = *from--;
    }
  }
}

// True if [to, to+byte_count) starts inside [from, from+byte_count), i.e.
// a forward copy would overwrite not-yet-copied source bytes.
static bool has_destructive_overlap(char* from, char* to, size_t byte_count) {
  return (from < to) && ((to - from) < (ptrdiff_t)byte_count);
}

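// Worked example (hypothetical addresses, illustration only): for
// from = (char*)0x1000, to = (char*)0x1008 and byte_count = 16, the target
// starts inside the source range, so a forward copy would clobber source
// bytes before they are read. has_destructive_overlap() returns true for
// exactly these cases, and the conjoint routines below then copy backwards.
#if 0
static void overlap_example(char* from, char* to, size_t byte_count) {
  if (has_destructive_overlap(from, to, byte_count)) {
    // Must copy backwards (highest address first).
  } else {
    // Forward copy (or any bulk move instruction) is safe.
  }
}
#endif
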
#ifdef USE_INLINE_ASM

//--------------------------------------------------------------
// Atomic copying. Atomicity is given by the minimum of source
// and target alignment. Refer to mail comm with Tim Slegel/IBM.
// Only usable for disjoint source and target.
//--------------------------------------------------------------
#define MOVE8_ATOMIC_4(_to,_from) {                                          \
  unsigned long toaddr;                                                      \
  unsigned long fromaddr;                                                    \
  asm(                                                                       \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */ \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */ \
    "MVC     0(32,%[toaddr]),0(%[fromaddr]) \n\t" /* move data            */ \
    : [to]       "+Q" (_to)                       /* outputs   */            \
    , [from]     "+Q" (_from)                                                \
    , [toaddr]   "=a" (toaddr)                                               \
    , [fromaddr] "=a" (fromaddr)                                             \
    :                                                                        \
    : "cc"                                        /* clobbered */            \
  );                                                                         \
}
#define MOVE8_ATOMIC_3(_to,_from) {                                          \
  unsigned long toaddr;                                                      \
  unsigned long fromaddr;                                                    \
  asm(                                                                       \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */ \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */ \
    "MVC     0(24,%[toaddr]),0(%[fromaddr]) \n\t" /* move data            */ \
    : [to]       "+Q" (_to)                       /* outputs   */            \
    , [from]     "+Q" (_from)                                                \
    , [toaddr]   "=a" (toaddr)                                               \
    , [fromaddr] "=a" (fromaddr)                                             \
    :                                                                        \
    : "cc"                                        /* clobbered */            \
  );                                                                         \
}
#define MOVE8_ATOMIC_2(_to,_from) {                                          \
  unsigned long toaddr;                                                      \
  unsigned long fromaddr;                                                    \
  asm(                                                                       \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */ \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */ \
    "MVC     0(16,%[toaddr]),0(%[fromaddr]) \n\t" /* move data            */ \
    : [to]       "+Q" (_to)                       /* outputs   */            \
    , [from]     "+Q" (_from)                                                \
    , [toaddr]   "=a" (toaddr)                                               \
    , [fromaddr] "=a" (fromaddr)                                             \
    :                                                                        \
    : "cc"                                        /* clobbered */            \
  );                                                                         \
}
#define MOVE8_ATOMIC_1(_to,_from) {                                          \
  unsigned long toaddr;                                                      \
  unsigned long fromaddr;                                                    \
  asm(                                                                       \
    "LG      %[toaddr],%[to]                \n\t" /* address of to area   */ \
    "LG      %[fromaddr],%[from]            \n\t" /* address of from area */ \
    "MVC     0(8,%[toaddr]),0(%[fromaddr])  \n\t" /* move data            */ \
    : [to]       "+Q" (_to)                       /* outputs   */            \
    , [from]     "+Q" (_from)                                                \
    , [toaddr]   "=a" (toaddr)                                               \
    , [fromaddr] "=a" (fromaddr)                                             \
    :                                                                        \
    : "cc"                                        /* clobbered */            \
  );                                                                         \
}

//--------------------------------------------------------------
// Atomic copying of 8-byte entities.
// Conjoint/disjoint property does not matter. Entities are first
// loaded and then stored.
// _to and _from must be 8-byte aligned.
//--------------------------------------------------------------
#define COPY8_ATOMIC_4(_to,_from) {                                \
  unsigned long toaddr;                                            \
  asm(                                                             \
    "LG      3,%[from]        \n\t" /* address of from area */     \
    "LG      %[toaddr],%[to]  \n\t" /* address of to area   */     \
    "LMG     0,3,0(3)         \n\t" /* load data            */     \
    "STMG    0,3,0(%[toaddr]) \n\t" /* store data           */     \
    : [to]     "+Q" (_to)           /* outputs   */                \
    , [from]   "+Q" (_from)                                        \
    , [toaddr] "=a" (toaddr)        /* temp      */                \
    :                                                              \
    : "cc", "r0", "r1", "r2", "r3"  /* clobbered */                \
  );                                                               \
}
#define COPY8_ATOMIC_3(_to,_from) {                                \
  unsigned long toaddr;                                            \
  asm(                                                             \
    "LG      2,%[from]        \n\t" /* address of from area */     \
    "LG      %[toaddr],%[to]  \n\t" /* address of to area   */     \
    "LMG     0,2,0(2)         \n\t" /* load data            */     \
    "STMG    0,2,0(%[toaddr]) \n\t" /* store data           */     \
    : [to]     "+Q" (_to)           /* outputs   */                \
    , [from]   "+Q" (_from)                                        \
    , [toaddr] "=a" (toaddr)        /* temp      */                \
    :                                                              \
    : "cc", "r0", "r1", "r2"        /* clobbered */                \
  );                                                               \
}
#define COPY8_ATOMIC_2(_to,_from) {                                \
  unsigned long toaddr;                                            \
  asm(                                                             \
    "LG      1,%[from]        \n\t" /* address of from area */     \
    "LG      %[toaddr],%[to]  \n\t" /* address of to area   */     \
    "LMG     0,1,0(1)         \n\t" /* load data            */     \
    "STMG    0,1,0(%[toaddr]) \n\t" /* store data           */     \
    : [to]     "+Q" (_to)           /* outputs   */                \
    , [from]   "+Q" (_from)                                        \
    , [toaddr] "=a" (toaddr)        /* temp      */                \
    :                                                              \
    : "cc", "r0", "r1"              /* clobbered */                \
  );                                                               \
}
#define COPY8_ATOMIC_1(_to,_from) {                                \
  unsigned long addr;                                              \
  asm(                                                             \
    "LG      %[addr],%[from]  \n\t" /* address of from area */     \
    "LG      0,0(0,%[addr])   \n\t" /* load data            */     \
    "LG      %[addr],%[to]    \n\t" /* address of to area   */     \
    "STG     0,0(0,%[addr])   \n\t" /* store data           */     \
    : [to]   "+Q" (_to)             /* outputs   */                \
    , [from] "+Q" (_from)                                          \
    , [addr] "=a" (addr)            /* temp      */                \
    :                                                              \
    : "cc", "r0"                    /* clobbered */                \
  );                                                               \
}

//--------------------------------------------------------------
// Atomic copying of 4-byte entities.
// Exactly 4 (four) entities are copied.
// Conjoint/disjoint property does not matter. Entities are first
// loaded and then stored.
// _to and _from must be 4-byte aligned.
//--------------------------------------------------------------
#define COPY4_ATOMIC_4(_to,_from) {                                \
  unsigned long toaddr;                                            \
  asm(                                                             \
    "LG      3,%[from]        \n\t" /* address of from area */     \
    "LG      %[toaddr],%[to]  \n\t" /* address of to area   */     \
    "LM      0,3,0(3)         \n\t" /* load data            */     \
    "STM     0,3,0(%[toaddr]) \n\t" /* store data           */     \
    : [to]     "+Q" (_to)           /* outputs   */                \
    , [from]   "+Q" (_from)                                        \
    , [toaddr] "=a" (toaddr)        /* temp      */                \
    :                                                              \
    : "cc", "r0", "r1", "r2", "r3"  /* clobbered */                \
  );                                                               \
}
#define COPY4_ATOMIC_3(_to,_from) {                                \
  unsigned long toaddr;                                            \
  asm(                                                             \
    "LG      2,%[from]        \n\t" /* address of from area */     \
    "LG      %[toaddr],%[to]  \n\t" /* address of to area   */     \
    "LM      0,2,0(2)         \n\t" /* load data            */     \
    "STM     0,2,0(%[toaddr]) \n\t" /* store data           */     \
    : [to]     "+Q" (_to)           /* outputs   */                \
    , [from]   "+Q" (_from)                                        \
    , [toaddr] "=a" (toaddr)        /* temp      */                \
    :                                                              \
    : "cc", "r0", "r1", "r2"        /* clobbered */                \
  );                                                               \
}
#define COPY4_ATOMIC_2(_to,_from) {                                \
  unsigned long toaddr;                                            \
  asm(                                                             \
    "LG      1,%[from]        \n\t" /* address of from area */     \
    "LG      %[toaddr],%[to]  \n\t" /* address of to area   */     \
    "LM      0,1,0(1)         \n\t" /* load data            */     \
    "STM     0,1,0(%[toaddr]) \n\t" /* store data           */     \
    : [to]     "+Q" (_to)           /* outputs   */                \
    , [from]   "+Q" (_from)                                        \
    , [toaddr] "=a" (toaddr)        /* temp      */                \
    :                                                              \
    : "cc", "r0", "r1"              /* clobbered */                \
  );                                                               \
}
#define COPY4_ATOMIC_1(_to,_from) {                                \
  unsigned long addr;                                              \
  asm(                                                             \
    "LG      %[addr],%[from]  \n\t" /* address of from area */     \
    "L       0,0(0,%[addr])   \n\t" /* load data            */     \
    "LG      %[addr],%[to]    \n\t" /* address of to area   */     \
    "ST      0,0(0,%[addr])   \n\t" /* store data           */     \
    : [to]   "+Q" (_to)             /* outputs   */                \
    , [from] "+Q" (_from)                                          \
    , [addr] "=a" (addr)            /* temp      */                \
    :                                                              \
    : "cc", "r0"                    /* clobbered */                \
  );                                                               \
}

#if 0 // Waiting for gcc to support EXRL.
#define MVC_MEMCOPY(_to,_from,_len)                                          \
  if (VM_Version::has_ExecuteExtensions()) {                                 \
    asm("\t"                                                                 \
      "    LAY     1,-1(0,%[len])             \n\t" /* decr for MVC      */  \
      "    EXRL    1,1f                       \n\t" /* execute MVC instr */  \
      "    BRC     15,2f                      \n\t" /* skip template     */  \
      "1:  MVC     0(%[len],%[to]),0(%[from]) \n\t"                          \
      "2:  BCR     0,0                        \n\t"                          \
      : [to]   "+Q" (_to)                           /* outputs   */          \
      , [from] "+Q" (_from)                                                  \
      : [len]  "r"  (_len)                          /* inputs    */          \
      : "cc", "r1"                                  /* clobbered */          \
    );                                                                       \
  } else {                                                                   \
    asm("\t"                                                                 \
      "    LARL    2,3f                       \n\t"                          \
      "    LAY     1,-1(0,%[len])             \n\t" /* decr for MVC      */  \
      "    EX      1,0(2)                     \n\t" /* execute MVC instr */  \
      "    BRC     15,4f                      \n\t" /* skip template     */  \
      "3:  MVC     0(%[len],%[to]),0(%[from]) \n\t"                          \
      "4:  BCR     0,0                        \n\t"                          \
      : [to]   "+Q" (_to)                           /* outputs   */          \
      , [from] "+Q" (_from)                                                  \
      : [len]  "r"  (_len)                          /* inputs    */          \
      : "cc", "r1", "r2"                            /* clobbered */          \
    );                                                                       \
  }
#else
#define MVC_MEMCOPY(_to,_from,_len)                                          \
{ unsigned long toaddr;   unsigned long tolen;                               \
  unsigned long fromaddr; unsigned long target;                              \
  asm("\t"                                                                   \
    "    LTGR    %[tolen],%[len]               \n\t" /* check for len=0    */\
    "    BRC     8,2f                          \n\t" /* do nothing for l=0 */\
    "    AGHI    %[tolen],-1                   \n\t" /* decr for MVC       */\
    "    LG      %[toaddr],%[to]               \n\t"                         \
    "    LG      %[fromaddr],%[from]           \n\t"                         \
    "    LARL    %[target],1f                  \n\t" /* addr of MVC instr  */\
    "    EX      %[tolen],0(%[target])         \n\t" /* execute MVC instr  */\
    "    BRC     15,2f                         \n\t" /* skip template      */\
    "1:  MVC     0(1,%[toaddr]),0(%[fromaddr]) \n\t"                         \
    "2:  BCR     0,0                           \n\t" /* nop as branch target */\
    : [to]       "+Q" (_to)                          /* outputs   */         \
    , [from]     "+Q" (_from)                                                \
    , [tolen]    "=a" (tolen)                                                \
    , [toaddr]   "=a" (toaddr)                                               \
    , [fromaddr] "=a" (fromaddr)                                             \
    , [target]   "=a" (target)                                               \
    : [len]      "r"  (_len)                         /* inputs    */         \
    : "cc"                                           /* clobbered */         \
  );                                                                         \
}
#endif

#if 0 // Code snippet to be used for debugging.
      /* ASSERT code BEGIN */                                       \
      "    LARL    %[len],5f     \n\t"                              \
      "    LARL    %[mta],4f     \n\t"                              \
      "    SLGR    %[len],%[mta] \n\t"                              \
      "    CGHI    %[len],16     \n\t"                              \
      "    BRC     7,9f          \n\t" /* block size != 16 */       \
                                                                    \
      "    LARL    %[len],1f     \n\t"                              \
      "    SLGR    %[len],%[mta] \n\t"                              \
      "    CGHI    %[len],256    \n\t"                              \
      "    BRC     7,9f          \n\t" /* list len != 256 */        \
                                                                    \
      "    LGR     0,0           \n\t" /* artificial SIGILL */      \
      "9:  BRC     7,-2          \n\t"                              \
      "    LARL    %[mta],1f     \n\t" /* restore MVC table begin */\
      /* ASSERT code END */
#endif

// Optimized copying for data less than 4k
// - no destructive overlap
// - 0 <= _n_bytes <= 4096
// This macro needs to be gcc-compiled with -march=z990. Otherwise, the
// LAY instruction is not available.
#define MVC_MULTI(_to,_from,_n_bytes)                                        \
{ unsigned long toaddr;                                                      \
  unsigned long fromaddr;                                                    \
  unsigned long movetable;                                                   \
  unsigned long len;                                                         \
  asm("\t"                                                                   \
    "    LTGFR   %[len],%[nby]              \n\t"                            \
    "    LG      %[ta],%[to]                \n\t" /* address of to area   */ \
    "    BRC     8,1f                       \n\t" /* nothing to copy      */ \
                                                                             \
    "    NILL    %[nby],255                 \n\t" /* # bytes mod 256      */ \
    "    LG      %[fa],%[from]              \n\t" /* address of from area */ \
    "    BRC     8,3f                       \n\t" /* no rest, skip copying */\
                                                                             \
    "    LARL    %[mta],2f                  \n\t" /* MVC template addr    */ \
    "    AHI     %[nby],-1                  \n\t" /* adjust for EX MVC    */ \
                                                                             \
    "    EX      %[nby],0(%[mta])           \n\t" /* only rightmost       */ \
                                                   /* 8 bits of nby used  */ \
    /* Since nby is <= 4096 on entry to this code, we need no */             \
    /* zero extension before using it in addr calc.           */             \
    "    LA      %[fa],1(%[nby],%[fa])      \n\t" /* adjust from addr     */ \
    "    LA      %[ta],1(%[nby],%[ta])      \n\t" /* adjust to addr       */ \
                                                                             \
    "3:  SRAG    %[nby],%[len],8            \n\t" /* # cache lines        */ \
    "    LARL    %[mta],1f                  \n\t" /* end of MVC table     */ \
    "    BRC     8,1f                       \n\t" /* nothing to copy      */ \
                                                                             \
    /* Insert ASSERT code here if required. */                               \
                                                                             \
    "    LNGFR   %[nby],%[nby]              \n\t" /* negative offset into    */ \
    "    SLLG    %[nby],%[nby],4            \n\t" /* MVC table, 16-byte blocks */ \
    "    BC      15,0(%[nby],%[mta])        \n\t" /* branch to block #ncl    */ \
                                                                             \
    "2:  MVC     0(1,%[ta]),0(%[fa])        \n\t" /* MVC template */         \
                                                                             \
    "4:  MVC     0(256,%[ta]),0(%[fa])      \n\t" /* 4096 == l          */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "5:  MVC     0(256,%[ta]),0(%[fa])      \n\t" /* 3840 <= l < 4096   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "    MVC     0(256,%[ta]),0(%[fa])      \n\t" /* 3584 <= l < 3840   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "    MVC     0(256,%[ta]),0(%[fa])      \n\t" /* 3328 <= l < 3584   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "    MVC     0(256,%[ta]),0(%[fa])      \n\t" /* 3072 <= l < 3328   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "    MVC     0(256,%[ta]),0(%[fa])      \n\t" /* 2816 <= l < 3072   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "    MVC     0(256,%[ta]),0(%[fa])      \n\t" /* 2560 <= l < 2816   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "    MVC     0(256,%[ta]),0(%[fa])      \n\t" /* 2304 <= l < 2560   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "    MVC     0(256,%[ta]),0(%[fa])      \n\t" /* 2048 <= l < 2304   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "    MVC     0(256,%[ta]),0(%[fa])      \n\t" /* 1792 <= l < 2048   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "    MVC     0(256,%[ta]),0(%[fa])      \n\t" /* 1536 <= l < 1792   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "    MVC     0(256,%[ta]),0(%[fa])      \n\t" /* 1280 <= l < 1536   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "    MVC     0(256,%[ta]),0(%[fa])      \n\t" /* 1024 <= l < 1280   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "    MVC     0(256,%[ta]),0(%[fa])      \n\t" /*  768 <= l < 1024   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "    MVC     0(256,%[ta]),0(%[fa])      \n\t" /*  512 <= l <  768   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "    MVC     0(256,%[ta]),0(%[fa])      \n\t" /*  256 <= l <  512   */   \
    "    LAY     %[ta],256(0,%[ta])         \n\t"                            \
    "    LA      %[fa],256(0,%[fa])         \n\t"                            \
    "1:  BCR     0,0                        \n\t" /* nop as branch target */ \
    : [to]   "+Q" (_to)                           /* outputs   */            \
    , [from] "+Q" (_from)                                                    \
    , [ta]   "=a" (toaddr)                                                   \
    , [fa]   "=a" (fromaddr)                                                 \
    , [mta]  "=a" (movetable)                                                \
    , [nby]  "+a" (_n_bytes)                                                 \
    , [len]  "=a" (len)                                                      \
    :                                                                        \
    : "cc"                                        /* clobbered */            \
  );                                                                         \
}

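// For illustration only: the dispatch arithmetic MVC_MULTI performs, in
// plain C++ (the function name is ours). The tail of len modulo 256 is
// handled by one EXecuted MVC; the remaining full 256-byte blocks are
// handled by branching into the table of 16-byte {MVC, LAY, LA} groups at
// offset -(#blocks)*16 from the end of the table (label 1).
#if 0
static void mvc_multi_dispatch_sketch(size_t len /* 0..4096 */) {
  size_t tail   = len & 255;           // copied via EX of the MVC template (length tail-1 encoded)
  size_t blocks = len >> 8;            // full cache lines: SRAG %[nby],%[len],8
  long   offset = -(long)blocks * 16;  // branch target relative to table end
  (void)tail; (void)offset;            // the asm uses these to enter the MVC chain
}
#endif
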
#define MVCLE_MEMCOPY(_to,_from,_len)                                        \
  asm(                                                                       \
    "    LG      0,%[to]   \n\t" /* address of to area   */                  \
    "    LG      2,%[from] \n\t" /* address of from area */                  \
    "    LGR     1,%[len]  \n\t" /* len of to area       */                  \
    "    LGR     3,%[len]  \n\t" /* len of from area     */                  \
    "1:  MVCLE   0,2,176   \n\t" /* copy storage, bypass cache (pad 0xb0) */ \
    "    BRC     1,1b      \n\t" /* retry if interrupted */                  \
    : [to]   "+Q" (_to)          /* outputs   */                             \
    , [from] "+Q" (_from)                                                    \
    : [len]  "r"  (_len)         /* inputs    */                             \
    : "cc", "r0", "r1", "r2", "r3" /* clobbered */                           \
  );

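// For illustration only: a plain-C++ model of the MVCLE semantics relied on
// above (register pair 0/1 = destination address/length, pair 2/3 = source
// address/length, third operand = padding byte). The function name and the
// modelling are ours, not part of this file's implementation.
#if 0
static void mvcle_model(unsigned char* dst, size_t dst_len,
                        const unsigned char* src, size_t src_len,
                        unsigned char pad) {
  size_t i = 0;
  for (; i < dst_len && i < src_len; i++) { dst[i] = src[i]; } // move part
  for (; i < dst_len; i++)               { dst[i] = pad; }     // pad part; 0xb0 bypasses the cache
}
#endif
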
#define MVCLE_MEMINIT(_to,_val,_len)                                         \
  asm(                                                                       \
    "    LG      0,%[to]         \n\t" /* address of to area   */            \
    "    LGR     1,%[len]        \n\t" /* len of to area       */            \
    "    XGR     3,3             \n\t" /* from area len = 0    */            \
    "1:  MVCLE   0,2,0(%[val])   \n\t" /* init storage         */            \
    "    BRC     1,1b            \n\t" /* retry if interrupted */            \
    : [to]  "+Q" (_to)                 /* outputs   */                       \
    : [len] "r"  (_len)                /* inputs    */                       \
    , [val] "r"  (_val)                /* inputs    */                       \
    : "cc", "r0", "r1", "r3"           /* clobbered */                       \
  );
#define MVCLE_MEMZERO(_to,_len)                                              \
  asm(                                                                       \
    "    LG      0,%[to]         \n\t" /* address of to area   */            \
    "    LGR     1,%[len]        \n\t" /* len of to area       */            \
    "    XGR     3,3             \n\t" /* from area len = 0    */            \
    "1:  MVCLE   0,2,0           \n\t" /* clear storage        */            \
    "    BRC     1,1b            \n\t" /* retry if interrupted */            \
    : [to]  "+Q" (_to)                 /* outputs   */                       \
    : [len] "r"  (_len)                /* inputs    */                       \
    : "cc", "r0", "r1", "r3"           /* clobbered */                       \
  );

// Clear a stretch of memory, 0 <= _len <= 256.
// There is no alignment prereq.
// There is no test for len out of the range specified above.
#define XC_MEMZERO_256(_to,_len)                                             \
{ unsigned long toaddr; unsigned long tolen;                                 \
  unsigned long target;                                                      \
  asm("\t"                                                                   \
    "    LTGR    %[tolen],%[len]             \n\t" /* check for len=0      */\
    "    BRC     8,2f                        \n\t" /* do nothing for l=0   */\
    "    AGHI    %[tolen],-1                 \n\t" /* adjust for EX XC     */\
    "    LARL    %[target],1f                \n\t" /* addr of XC instr     */\
    "    LG      %[toaddr],%[to]             \n\t" /* addr of data area    */\
    "    EX      %[tolen],0(%[target])       \n\t" /* execute XC instr     */\
    "    BRC     15,2f                       \n\t" /* skip template        */\
    "1:  XC      0(1,%[toaddr]),0(%[toaddr]) \n\t"                           \
    "2:  BCR     0,0                         \n\t" /* nop as branch target */\
    : [to]     "+Q" (_to)                          /* outputs   */           \
    , [tolen]  "=a" (tolen)                                                  \
    , [toaddr] "=a" (toaddr)                                                 \
    , [target] "=a" (target)                                                 \
    : [len]    "r"  (_len)                         /* inputs    */           \
    : "cc"                                         /* clobbered */           \
  );                                                                         \
}

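// For illustration only: why the EX/XC pair above clears 1..256 bytes. EX
// ORs bits 8-15 of its register into the length field of the XC template,
// and an SS-format length of n encodes n+1 bytes, hence the AGHI -1 above.
// A sketch of the effective length (the name is ours):
#if 0
static size_t effective_xc_length(size_t len /* 1..256 */) {
  size_t encoded = len - 1;     // value placed in the register for EX
  return (encoded & 0xff) + 1;  // length the executed XC actually clears
}
#endif
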
// Clear a stretch of memory, 256 < _len.
// XC_MEMZERO_256 may be used to clear shorter areas.
//
// The code
// - first zeroes a few bytes to align on a HeapWord.
//   This step is currently inactive because all calls seem
//   to have their data aligned on HeapWord boundaries.
// - then zeroes a few HeapWords to align on a cache line.
// - then zeroes entire cache lines in a loop.
// - then zeroes the remaining (partial) cache line.
#if 1
#define XC_MEMZERO_ANY(_to,_len)                                             \
{ unsigned long toaddr; unsigned long tolen;                                 \
  unsigned long len8;   unsigned long len256;                                \
  unsigned long target; unsigned long lenx;                                  \
  asm("\t"                                                                   \
    "    LTGR    %[tolen],%[len]             \n\t" /* check for len=0      */\
    "    BRC     8,2f                        \n\t" /* do nothing for l=0   */\
    "    LG      %[toaddr],%[to]             \n\t" /* addr of data area    */\
    "    LARL    %[target],1f                \n\t" /* addr of XC instr     */\
                                                                             \
    "    LCGR    %[len256],%[toaddr]         \n\t" /* cache line alignment */\
    "    NILL    %[len256],0xff              \n\t"                           \
    "    BRC     8,4f                        \n\t" /* already aligned      */\
    "    NILH    %[len256],0x00              \n\t" /* zero extend          */\
    "    LLGFR   %[len256],%[len256]         \n\t"                           \
    "    LAY     %[lenx],-1(,%[len256])      \n\t"                           \
    "    EX      %[lenx],0(%[target])        \n\t" /* execute XC instr     */\
    "    LA      %[toaddr],0(%[len256],%[toaddr]) \n\t"                      \
    "    SGR     %[tolen],%[len256]          \n\t" /* adjust len           */\
                                                                             \
    "4:  SRAG    %[lenx],%[tolen],8          \n\t" /* # cache lines        */\
    "    BRC     8,6f                        \n\t" /* no full cache lines  */\
    "5:  XC      0(256,%[toaddr]),0(%[toaddr]) \n\t"                         \
    "    LA      %[toaddr],256(,%[toaddr])   \n\t"                           \
    "    BRCTG   %[lenx],5b                  \n\t" /* iterate              */\
                                                                             \
    "6:  NILL    %[tolen],0xff               \n\t" /* leftover bytes       */\
    "    BRC     8,2f                        \n\t" /* done if none         */\
    "    LAY     %[lenx],-1(,%[tolen])       \n\t"                           \
    "    EX      %[lenx],0(%[target])        \n\t" /* execute XC instr     */\
    "    BRC     15,2f                       \n\t" /* skip template        */\
                                                                             \
    "1:  XC      0(1,%[toaddr]),0(%[toaddr]) \n\t"                           \
    "2:  BCR     0,0                         \n\t" /* nop as branch target */\
    : [to]     "+Q" (_to)                          /* outputs   */           \
    , [lenx]   "=a" (lenx)                                                   \
    , [len256] "=a" (len256)                                                 \
    , [tolen]  "=a" (tolen)                                                  \
    , [toaddr] "=a" (toaddr)                                                 \
    , [target] "=a" (target)                                                 \
    : [len]    "r"  (_len)                         /* inputs    */           \
    : "cc"                                         /* clobbered */           \
  );                                                                         \
}
#else
#define XC_MEMZERO_ANY(_to,_len)                                             \
{ unsigned long toaddr; unsigned long tolen;                                 \
  unsigned long len8;   unsigned long len256;                                \
  unsigned long target; unsigned long lenx;                                  \
  asm("\t"                                                                   \
    "    LTGR    %[tolen],%[len]             \n\t" /* check for len=0      */\
    "    BRC     8,2f                        \n\t" /* do nothing for l=0   */\
    "    LG      %[toaddr],%[to]             \n\t" /* addr of data area    */\
    "    LARL    %[target],1f                \n\t" /* addr of XC instr     */\
                                                                             \
    "    LCGR    %[len8],%[toaddr]           \n\t" /* HeapWord alignment   */\
    "    NILL    %[len8],0x07                \n\t"                           \
    "    BRC     8,3f                        \n\t" /* already aligned      */\
    "    NILH    %[len8],0x00                \n\t" /* zero extend          */\
    "    LLGFR   %[len8],%[len8]             \n\t"                           \
    "    LAY     %[lenx],-1(,%[len8])        \n\t"                           \
    "    EX      %[lenx],0(%[target])        \n\t" /* execute XC instr     */\
    "    LA      %[toaddr],0(%[len8],%[toaddr]) \n\t"                        \
    "    SGR     %[tolen],%[len8]            \n\t" /* adjust len           */\
                                                                             \
    "3:  LCGR    %[len256],%[toaddr]         \n\t" /* cache line alignment */\
    "    NILL    %[len256],0xff              \n\t"                           \
    "    BRC     8,4f                        \n\t" /* already aligned      */\
    "    NILH    %[len256],0x00              \n\t" /* zero extend          */\
    "    LLGFR   %[len256],%[len256]         \n\t"                           \
    "    LAY     %[lenx],-1(,%[len256])      \n\t"                           \
    "    EX      %[lenx],0(%[target])        \n\t" /* execute XC instr     */\
    "    LA      %[toaddr],0(%[len256],%[toaddr]) \n\t"                      \
    "    SGR     %[tolen],%[len256]          \n\t" /* adjust len           */\
                                                                             \
    "4:  SRAG    %[lenx],%[tolen],8          \n\t" /* # cache lines        */\
    "    BRC     8,6f                        \n\t" /* no full cache lines  */\
    "5:  XC      0(256,%[toaddr]),0(%[toaddr]) \n\t"                         \
    "    LA      %[toaddr],256(,%[toaddr])   \n\t"                           \
    "    BRCTG   %[lenx],5b                  \n\t" /* iterate              */\
                                                                             \
    "6:  NILL    %[tolen],0xff               \n\t" /* leftover bytes       */\
    "    BRC     8,2f                        \n\t" /* done if none         */\
    "    LAY     %[lenx],-1(,%[tolen])       \n\t"                           \
    "    EX      %[lenx],0(%[target])        \n\t" /* execute XC instr     */\
    "    BRC     15,2f                       \n\t" /* skip template        */\
                                                                             \
    "1:  XC      0(1,%[toaddr]),0(%[toaddr]) \n\t"                           \
    "2:  BCR     0,0                         \n\t" /* nop as branch target */\
    : [to]     "+Q" (_to)                          /* outputs   */           \
    , [lenx]   "=a" (lenx)                                                   \
    , [len8]   "=a" (len8)                                                   \
    , [len256] "=a" (len256)                                                 \
    , [tolen]  "=a" (tolen)                                                  \
    , [toaddr] "=a" (toaddr)                                                 \
    , [target] "=a" (target)                                                 \
    : [len]    "r"  (_len)                         /* inputs    */           \
    : "cc"                                         /* clobbered */           \
  );                                                                         \
}
#endif
#endif // USE_INLINE_ASM

//*************************************//
//   D I S J O I N T   C O P Y I N G   //
//*************************************//

static void pd_aligned_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very frequent, some tests frequent.

  // Copy HeapWord (=DW) aligned storage. Use MVCLE in inline-asm code.
  // MVCLE guarantees DW concurrent (i.e. atomic) accesses if both the addresses of the operands
  // are DW aligned and the length is an integer multiple of a DW. Should always be true here.
  //
  // No special exploit needed. H/W discovers suitable situations itself.
  //
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) copy short piece of memory to page-align address(es)
  // 2) copy largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an int multiple of page size. In that case, up to 4 cache lines are
  //    processed in parallel and L1 cache is not polluted.
  // 3) copy the remaining piece of memory.
  //
#ifdef USE_INLINE_ASM
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count*HeapWordSize;

  // Optimized copying for data less than 4k
  switch (count) {
    case 0: return;
    case 1: MOVE8_ATOMIC_1(to,from)
            return;
    case 2: MOVE8_ATOMIC_2(to,from)
            return;
    // case 3: MOVE8_ATOMIC_3(to,from)
    //         return;
    // case 4: MOVE8_ATOMIC_4(to,from)
    //         return;
    default:
      if (len_bytes <= 4096) {
        MVC_MULTI(to,from,len_bytes)
        return;
      }
      // else
      MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
      return;
  }
#else
  // Fallback code.
  switch (count) {
    case 0:
      return;

    case 1:
      *to = *from;
      return;

    case 2:
      *to++ = *from++;
      *to = *from;
      return;

    case 3:
      *to++ = *from++;
      *to++ = *from++;
      *to = *from;
      return;

    case 4:
      *to++ = *from++;
      *to++ = *from++;
      *to++ = *from++;
      *to = *from;
      return;

    default:
      while (count-- > 0)
        *(to++) = *(from++);
      return;
  }
#endif
}

static void pd_disjoint_words_atomic(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: < 4k calls.
  assert(((((size_t)from) & 0x07L) | (((size_t)to) & 0x07L)) == 0, "No atomic copy w/o aligned data");
  pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
}

static void pd_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very rare.
  pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
}


//*************************************//
//   C O N J O I N T   C O P Y I N G   //
//*************************************//

static void pd_aligned_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: between some and lower end of frequent.

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from)
              return;
      case 3: COPY8_ATOMIC_3(to,from)
              return;
      case 2: COPY8_ATOMIC_2(to,from)
              return;
      case 1: COPY8_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0)
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  return;
#else
  // Fallback code.
  if (has_destructive_overlap((char*)from, (char*)to, count*BytesPerLong)) {
    HeapWord t1, t2, t3;
    switch (count) {
      case 0:
        return;

      case 1:
        *to = *from;
        return;

      case 2:
        t1 = *(from+1);
        *to = *from;
        *(to+1) = t1;
        return;

      case 3:
        t1 = *(from+1);
        t2 = *(from+2);
        *to = *from;
        *(to+1) = t1;
        *(to+2) = t2;
        return;

      case 4:
        t1 = *(from+1);
        t2 = *(from+2);
        t3 = *(from+3);
        *to = *from;
        *(to+1) = t1;
        *(to+2) = t2;
        *(to+3) = t3;
        return;

      default:
        from += count;
        to   += count;
        while (count-- > 0)
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_disjoint_words(from, to, count);
#endif
}

static void pd_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_conjoint_words(from, to, count);
}

static void pd_conjoint_bytes(void* from, void* to, size_t count) {
#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in)) {
    (void)memmove(to, from, count_in);
  } else {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  if (has_destructive_overlap((char*)from, (char*)to, count))
    (void)memmove(to, from, count);
  else
    (void)memcpy(to, from, count);
#endif
}

//**************************************************//
//   C O N J O I N T   A T O M I C   C O P Y I N G  //
//**************************************************//

static void pd_conjoint_bytes_atomic(void* from, void* to, size_t count) {
  // Just delegate. Bytes are always accessed atomically.
  pd_conjoint_bytes(from, to, count);
}

static void pd_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {
#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerShort)) {
    // Use optimizations from shared code where no z-specific optimization exists.
    copy_conjoint_jshorts_atomic(from, to, count);
  } else {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in*BytesPerShort;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jshorts_atomic(from, to, count);
#endif
}

static void pd_conjoint_jints_atomic(jint* from, jint* to, size_t count) {
#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerInt)) {
    switch (count_in) {
      case 4: COPY4_ATOMIC_4(to,from)
              return;
      case 3: COPY4_ATOMIC_3(to,from)
              return;
      case 2: COPY4_ATOMIC_2(to,from)
              return;
      case 1: COPY4_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        // Use optimizations from shared code where no z-specific optimization exists.
        copy_conjoint_jints_atomic(from, to, count_in);
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerInt;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jints_atomic(from, to, count);
#endif
}

static void pd_conjoint_jlongs_atomic(jlong* from, jlong* to, size_t count) {
#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from) return;
      case 3: COPY8_ATOMIC_3(to,from) return;
      case 2: COPY8_ATOMIC_2(to,from) return;
      case 1: COPY8_ATOMIC_1(to,from) return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    if (count_in < 8) {
      from += count_in;
      to   += count_in;
      while (count_in-- > 0)
        *(--to) = *(--from); // Copy backwards, areas overlap destructively.
      return;
    }
    // else: copy backwards, two elements per iteration.
    from += count_in-1;
    to   += count_in-1;
    if (count_in&0x01) {
      *(to--) = *(from--);
      count_in--;
    }
    for (; count_in>0; count_in-=2) {
      *to     = *from;
      *(to-1) = *(from-1);
      to   -= 2;
      from -= 2;
    }
  } else {
    pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // Rare calls -> just delegate.
  }
#endif
}

static void pd_conjoint_oops_atomic(oop* from, oop* to, size_t count) {
#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from) return;
      case 3: COPY8_ATOMIC_3(to,from) return;
      case 2: COPY8_ATOMIC_2(to,from) return;
      case 1: COPY8_ATOMIC_1(to,from) return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerOop;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    from += count_in;
    to   += count_in;
    while (count_in-- > 0) *(--to) = *(--from); // Copy backwards, areas overlap destructively.
    return;
  }
  // else
  pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // Rare calls -> just delegate.
  return;
#endif
}

static void pd_arrayof_conjoint_bytes(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_bytes_atomic(from, to, count);
}

static void pd_arrayof_conjoint_jshorts(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jshorts_atomic((jshort*)from, (jshort*)to, count);
}

static void pd_arrayof_conjoint_jints(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jints_atomic((jint*)from, (jint*)to, count);
}

static void pd_arrayof_conjoint_jlongs(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jlongs_atomic((jlong*)from, (jlong*)to, count);
}

static void pd_arrayof_conjoint_oops(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_oops_atomic((oop*)from, (oop*)to, count);
}

//**********************************************//
//  M E M O R Y   I N I T I A L I S A T I O N   //
//**********************************************//

static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {
  // JVM2008: very rare, only in some tests.
#ifdef USE_INLINE_ASM
  // Initialize storage to a given value. Use memset instead of copy loop.
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) init short piece of memory to page-align address
  // 2) init largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an int multiple of page size. In that case, up to 4 cache lines are
  //    processed in parallel and L1 cache is not polluted.
  // 3) init the remaining piece of memory.
  // Atomicity cannot really be an issue since gcc implements the loop body with XC anyway.
  // If atomicity is a problem, we have to prevent gcc optimization. Best workaround: inline asm.

  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;

  MVCLE_MEMINIT(to_bytes, value, len_bytes)

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, value, count);
#endif
}

static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) {
  // Occurs in dbg builds only. Usually memory poisoning with BAADBABE, DEADBEEF, etc.
  // JVM2008: < 4k calls.
  if (value == 0) {
    pd_zero_to_words(tohw, count);
    return;
  }
  if (value == ~(juint)(0)) {
    pd_fill_to_bytes(tohw, count*HeapWordSize, (jubyte)(~(juint)(0)));
    return;
  }
  julong* to = (julong*) tohw;
  julong  v  = ((julong) value << 32) | value;
  while (count-- > 0) {
    *to++ = v;
  }
}

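// Worked example for the replication above (illustration only; the function
// name is ours): with value == 0xDEADBEEF, v becomes 0xDEADBEEFDEADBEEF, so
// each 8-byte HeapWord store writes the 4-byte pattern twice.
#if 0
static julong replicate_example() {
  juint  value = 0xDEADBEEF;
  julong v     = ((julong) value << 32) | value; // == 0xDEADBEEFDEADBEEF
  return v;
}
#endif
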
static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) {
  // JVM2008: very frequent, but virtually all calls are with value == 0.
  pd_fill_to_words(tohw, count, value);
}

//**********************************//
//  M E M O R Y   C L E A R I N G   //
//**********************************//

// Delegate to pd_zero_to_bytes. It also works HeapWord-atomically.
// Distinguish between simple and large zero_to_words.
static void pd_zero_to_words(HeapWord* tohw, size_t count) {
  pd_zero_to_bytes(tohw, count*HeapWordSize);
}

// Delegate to pd_zero_to_bytes. It also works HeapWord-atomically.
static void pd_zero_to_words_large(HeapWord* tohw, size_t count) {
  // JVM2008: generally frequent, some tests show very frequent calls.
  pd_zero_to_bytes(tohw, count*HeapWordSize);
}

static void pd_zero_to_bytes(void* to, size_t count) {
  // JVM2008: some calls (generally), some tests frequent.
#ifdef USE_INLINE_ASM
  // Even zero_to_bytes() requires HeapWord-atomic, or, at least, sequential
  // zeroing of the memory. MVCLE is not fit for that job:
  //   "As observed by other CPUs and by the channel subsystem,
  //    that portion of the first operand which is filled
  //    with the padding byte is not necessarily stored into in
  //    a left-to-right direction and may appear to be stored
  //    into more than once."
  // Therefore, the implementation was changed to use (multiple) XC instructions.

  const long line_size = 256;
  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;

  if (len_bytes <= line_size) {
    XC_MEMZERO_256(to_bytes, len_bytes);
  } else {
    XC_MEMZERO_ANY(to_bytes, len_bytes);
  }

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, 0, count);
#endif
}

#endif // CPU_S390_VM_COPY_S390_HPP