author | herrick |
Thu, 17 Oct 2019 07:55:35 -0400 | |
branch | JDK-8200758-branch |
changeset 58670 | 6fb9e12d5595 |
parent 53244 | 9807daeb47c4 |
permissions | -rw-r--r-- |
42065 | 1 |
/* |
53244
9807daeb47c4
8216167: Update include guards to reflect correct directories
coleenp
parents:
48956
diff
changeset
|
2 |
* Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved. |
42065 | 3 |
* Copyright (c) 2016 SAP SE. All rights reserved. |
4 |
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. |
|
5 |
* |
|
6 |
* This code is free software; you can redistribute it and/or modify it |
|
7 |
* under the terms of the GNU General Public License version 2 only, as |
|
8 |
* published by the Free Software Foundation. |
|
9 |
* |
|
10 |
* This code is distributed in the hope that it will be useful, but WITHOUT |
|
11 |
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
|
12 |
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License |
|
13 |
* version 2 for more details (a copy is included in the LICENSE file that |
|
14 |
* accompanied this code). |
|
15 |
* |
|
16 |
* You should have received a copy of the GNU General Public License version |
|
17 |
* 2 along with this work; if not, write to the Free Software Foundation, |
|
18 |
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. |
|
19 |
* |
|
20 |
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA |
|
21 |
* or visit www.oracle.com if you need additional information or have any |
|
22 |
* questions. |
|
23 |
* |
|
24 |
*/ |
|
25 |
||
26 |
// Major contributions by LS |
|
27 |
||
53244
9807daeb47c4
8216167: Update include guards to reflect correct directories
coleenp
parents:
48956
diff
changeset
|
28 |
#ifndef CPU_S390_COPY_S390_HPP |
9807daeb47c4
8216167: Update include guards to reflect correct directories
coleenp
parents:
48956
diff
changeset
|
29 |
#define CPU_S390_COPY_S390_HPP |
42065 | 30 |
|
31 |
// Inline functions for memory copy and fill. |
|
32 |
||
33 |
// HeapWordSize (the size of class HeapWord) is 8 Bytes (the size of a |
|
34 |
// pointer variable), since we always run the _LP64 model. As a consequence, |
|
35 |
// HeapWord* memory ranges are always assumed to be doubleword-aligned, |
|
36 |
// having a size which is an integer multiple of HeapWordSize. |
|
37 |
// |
|
38 |
// Dealing only with doubleword-aligned doubleword units has important |
|
39 |
// positive performance and data access consequences. Many of the move |
|
40 |
// instructions perform particularly well under these circumstances. |
|
41 |
// Data access is "doubleword-concurrent", except for MVC and XC. |
|
42 |
// Furthermore, data access can be forced to be sequential (MVCL and MVCLE) |
|
43 |
// by use of the special padding byte 0xb1, where required. For copying, |
|
44 |
// we use padding byte 0xb0 to prevent the D-cache from being polluted. |
|
45 |
// |
|
46 |
// On z/Architecture, gcc optimizes memcpy into a series of MVC instructions. |
|
47 |
// This is optimal, even if just one HeapWord is copied. However, MVC |
|
48 |
// copying is not atomic, i.e. not "doubleword concurrent" by definition. |
|
49 |
// |
|
50 |
// If the -mmvcle compiler option is specified, memcpy translates into |
|
51 |
// code such that the entire memory range is copied or preset with just |
|
52 |
// one MVCLE instruction. |
|
53 |
// |
|
54 |
// *to = *from is transformed into a MVC instruction already with -O1. |
|
55 |
// Thus, for atomic copy operations, (inline) assembler code is required |
|
56 |
// to guarantee atomic data accesses. |
|
57 |
// |
|
58 |
// For large (len >= MVCLEThreshold) chunks of memory, we exploit |
|
59 |
// special H/W support of z/Architecture: |
|
60 |
// 1) copy short piece of memory to page-align address(es) |
|
61 |
// 2) copy largest part (all contained full pages) of memory using mvcle instruction. |
|
62 |
// z/Architecture processors have special H/W support for page-aligned storage |
|
63 |
// where len is an int multiple of page size. In that case, up to 4 cache lines are |
|
64 |
// processed in parallel and L1 cache is not polluted. |
|
65 |
// 3) copy the remaining piece of memory. |
|
66 |
// |
|
67 |
// Measurement classifications: |
|
68 |
// very rare - <= 10.000 calls AND <= 1.000 usec elapsed |
|
69 |
// rare - <= 100.000 calls AND <= 10.000 usec elapsed |
|
70 |
// some - <= 1.000.000 calls AND <= 100.000 usec elapsed |
|
71 |
// freq - <= 10.000.000 calls AND <= 1.000.000 usec elapsed |
|
72 |
// very freq - > 10.000.000 calls OR > 1.000.000 usec elapsed |
|
73 |
||
74 |
#undef USE_INLINE_ASM |
|
75 |
||
48951 | 76 |
// Copy 'count' jshort elements from 'from' to 'to'. The two ranges may overlap.
// The copy direction is chosen such that no source element is overwritten
// before it has been read. Every element is transferred by one plain jshort
// assignment.
//
//   from   first source element
//   to     first destination element
//   count  number of elements (not bytes) to copy; 0 is allowed
static void copy_conjoint_jshorts_atomic(const jshort* from, jshort* to, size_t count) {
  if (from > to) {
    // Destination starts below the source: ascending copy is safe.
    while (count-- > 0) {
      *to++ = *from++;
    }
  } else {
    // Destination starts at or above the source: copy in descending order.
    // Start one element past the end and pre-decrement. That way no pointer
    // in front of the first element is ever formed (which 'ptr += count - 1'
    // followed by post-decrement did, and which wrapped around for count == 0).
    from += count;
    to   += count;
    while (count-- > 0) {
      *--to = *--from;
    }
  }
}
|
91 |
||
48951 | 92 |
// Copy 'count' jint elements from 'from' to 'to'. The two ranges may overlap.
// The copy direction is chosen such that no source element is overwritten
// before it has been read. Every element is transferred by one plain jint
// assignment.
//
//   from   first source element
//   to     first destination element
//   count  number of elements (not bytes) to copy; 0 is allowed
static void copy_conjoint_jints_atomic(const jint* from, jint* to, size_t count) {
  if (from > to) {
    // Destination starts below the source: ascending copy is safe.
    while (count-- > 0) {
      *to++ = *from++;
    }
  } else {
    // Destination starts at or above the source: copy in descending order.
    // Start one element past the end and pre-decrement. That way no pointer
    // in front of the first element is ever formed (which 'ptr += count - 1'
    // followed by post-decrement did, and which wrapped around for count == 0).
    from += count;
    to   += count;
    while (count-- > 0) {
      *--to = *--from;
    }
  }
}
|
107 |
||
48951 | 108 |
// Tell whether an ascending copy of byte_count bytes from 'from' to 'to' would
// overwrite source bytes that have not been read yet. This is the case iff
// 'to' starts strictly behind 'from' but less than byte_count bytes behind it.
//
//   from        start of the source area
//   to          start of the destination area
//   byte_count  length of both areas in bytes
//   returns     true if the areas overlap destructively, false otherwise
static bool has_destructive_overlap(const char* from, char* to, size_t byte_count) {
  if (from >= to) {
    return false;  // Destination at or below source: ascending copy is always safe.
  }
  // to > from here, so the distance is positive and converts to size_t without
  // loss. Comparing unsigned avoids casting byte_count to ptrdiff_t, which
  // turned values above PTRDIFF_MAX negative and hid the overlap.
  return (size_t)(to - from) < byte_count;
}
|
111 |
||
112 |
#ifdef USE_INLINE_ASM |
|
113 |
||
114 |
//-------------------------------------------------------------- |
|
115 |
// Atomic copying. Atomicity is given by the minimum of source |
|
116 |
// and target alignment. Refer to mail comm with Tim Slegel/IBM. |
|
117 |
// Only usable for disjoint source and target. |
|
118 |
//-------------------------------------------------------------- |
|
119 |
// Copy 4 doublewords (32 bytes) with a single MVC instruction.
// _to and _from are pointer variables passed as memory operands; their values
// are first loaded into the address registers toaddr and fromaddr.
// NOTE(review): toaddr/fromaddr are plain "=a" outputs which are written before
// %[from] has been read -- confirm whether they need the earlyclobber form "=&a".
#define MOVE8_ATOMIC_4(_to,_from) { \
  unsigned long toaddr; \
  unsigned long fromaddr; \
  asm( \
    "LG %[toaddr],%[to] \n\t" /* address of to area */ \
    "LG %[fromaddr],%[from] \n\t" /* address of from area */ \
    "MVC 0(32,%[toaddr]),0(%[fromaddr]) \n\t" /* move data (32 bytes) */ \
    : [to] "+Q" (_to) /* outputs */ \
    , [from] "+Q" (_from) \
    , [toaddr] "=a" (toaddr) \
    , [fromaddr] "=a" (fromaddr) \
    : /* no inputs */ \
    : "cc" /* clobbered */ \
  ); \
}
|
134 |
// Copy 3 doublewords (24 bytes) with a single MVC instruction.
// _to and _from are pointer variables passed as memory operands; their values
// are first loaded into the address registers toaddr and fromaddr.
// NOTE(review): toaddr/fromaddr are plain "=a" outputs which are written before
// %[from] has been read -- confirm whether they need the earlyclobber form "=&a".
#define MOVE8_ATOMIC_3(_to,_from) { \
  unsigned long toaddr; \
  unsigned long fromaddr; \
  asm( \
    "LG %[toaddr],%[to] \n\t" /* address of to area */ \
    "LG %[fromaddr],%[from] \n\t" /* address of from area */ \
    "MVC 0(24,%[toaddr]),0(%[fromaddr]) \n\t" /* move data (24 bytes) */ \
    : [to] "+Q" (_to) /* outputs */ \
    , [from] "+Q" (_from) \
    , [toaddr] "=a" (toaddr) \
    , [fromaddr] "=a" (fromaddr) \
    : /* no inputs */ \
    : "cc" /* clobbered */ \
  ); \
}
|
149 |
// Copy 2 doublewords (16 bytes) with a single MVC instruction.
// _to and _from are pointer variables passed as memory operands; their values
// are first loaded into the address registers toaddr and fromaddr.
// NOTE(review): toaddr/fromaddr are plain "=a" outputs which are written before
// %[from] has been read -- confirm whether they need the earlyclobber form "=&a".
#define MOVE8_ATOMIC_2(_to,_from) { \
  unsigned long toaddr; \
  unsigned long fromaddr; \
  asm( \
    "LG %[toaddr],%[to] \n\t" /* address of to area */ \
    "LG %[fromaddr],%[from] \n\t" /* address of from area */ \
    "MVC 0(16,%[toaddr]),0(%[fromaddr]) \n\t" /* move data (16 bytes) */ \
    : [to] "+Q" (_to) /* outputs */ \
    , [from] "+Q" (_from) \
    , [toaddr] "=a" (toaddr) \
    , [fromaddr] "=a" (fromaddr) \
    : /* no inputs */ \
    : "cc" /* clobbered */ \
  ); \
}
|
164 |
// Copy 1 doubleword (8 bytes) with a single MVC instruction.
// _to and _from are pointer variables passed as memory operands; their values
// are first loaded into the address registers toaddr and fromaddr.
// NOTE(review): toaddr/fromaddr are plain "=a" outputs which are written before
// %[from] has been read -- confirm whether they need the earlyclobber form "=&a".
#define MOVE8_ATOMIC_1(_to,_from) { \
  unsigned long toaddr; \
  unsigned long fromaddr; \
  asm( \
    "LG %[toaddr],%[to] \n\t" /* address of to area */ \
    "LG %[fromaddr],%[from] \n\t" /* address of from area */ \
    "MVC 0(8,%[toaddr]),0(%[fromaddr]) \n\t" /* move data (8 bytes) */ \
    : [to] "+Q" (_to) /* outputs */ \
    , [from] "+Q" (_from) \
    , [toaddr] "=a" (toaddr) \
    , [fromaddr] "=a" (fromaddr) \
    : /* no inputs */ \
    : "cc" /* clobbered */ \
  ); \
}
|
179 |
||
180 |
//-------------------------------------------------------------- |
|
181 |
// Atomic copying of 8-byte entities. |
|
182 |
// Conjoint/disjoint property does not matter. Entities are first |
|
183 |
// loaded and then stored. |
|
184 |
// _to and _from must be 8-byte aligned. |
|
185 |
//-------------------------------------------------------------- |
|
186 |
// Copy 4 doublewords: load all of them into r0..r3 (LMG), then store them (STMG).
// r3 first holds the source address and is overwritten by the LMG itself.
// _to and _from are pointer variables passed as memory operands.
#define COPY8_ATOMIC_4(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG 3,%[from] \n\t" /* address of from area */ \
    "LG %[toaddr],%[to] \n\t" /* address of to area */ \
    "LMG 0,3,0(3) \n\t" /* load data into r0..r3 */ \
    "STMG 0,3,0(%[toaddr]) \n\t" /* store data from r0..r3 */ \
    : [to] "+Q" (_to) /* outputs */ \
    , [from] "+Q" (_from) /* outputs */ \
    , [toaddr] "=a" (toaddr) /* output (scratch address register) */ \
    : /* no inputs */ \
    : "cc", "r0", "r1", "r2", "r3" /* clobbered */ \
  ); \
}
|
200 |
// Copy 3 doublewords: load all of them into r0..r2 (LMG), then store them (STMG).
// r2 first holds the source address and is overwritten by the LMG itself.
// _to and _from are pointer variables passed as memory operands.
#define COPY8_ATOMIC_3(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG 2,%[from] \n\t" /* address of from area */ \
    "LG %[toaddr],%[to] \n\t" /* address of to area */ \
    "LMG 0,2,0(2) \n\t" /* load data into r0..r2 */ \
    "STMG 0,2,0(%[toaddr]) \n\t" /* store data from r0..r2 */ \
    : [to] "+Q" (_to) /* outputs */ \
    , [from] "+Q" (_from) /* outputs */ \
    , [toaddr] "=a" (toaddr) /* output (scratch address register) */ \
    : /* no inputs */ \
    : "cc", "r0", "r1", "r2" /* clobbered */ \
  ); \
}
|
214 |
// Copy 2 doublewords: load both into r0..r1 (LMG), then store them (STMG).
// r1 first holds the source address and is overwritten by the LMG itself.
// _to and _from are pointer variables passed as memory operands.
#define COPY8_ATOMIC_2(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG 1,%[from] \n\t" /* address of from area */ \
    "LG %[toaddr],%[to] \n\t" /* address of to area */ \
    "LMG 0,1,0(1) \n\t" /* load data into r0..r1 */ \
    "STMG 0,1,0(%[toaddr]) \n\t" /* store data from r0..r1 */ \
    : [to] "+Q" (_to) /* outputs */ \
    , [from] "+Q" (_from) /* outputs */ \
    , [toaddr] "=a" (toaddr) /* output (scratch address register) */ \
    : /* no inputs */ \
    : "cc", "r0", "r1" /* clobbered */ \
  ); \
}
|
228 |
// Copy 1 doubleword: load it into r0 (LG), then store it (STG).
// The same scratch register 'addr' holds first the source and then the
// destination address. _to and _from are pointer variables passed as memory
// operands.
#define COPY8_ATOMIC_1(_to,_from) { \
  unsigned long addr; \
  asm( \
    "LG %[addr],%[from] \n\t" /* address of from area */ \
    "LG 0,0(0,%[addr]) \n\t" /* load data into r0 */ \
    "LG %[addr],%[to] \n\t" /* address of to area */ \
    "STG 0,0(0,%[addr]) \n\t" /* store data from r0 */ \
    : [to] "+Q" (_to) /* outputs */ \
    , [from] "+Q" (_from) /* outputs */ \
    , [addr] "=a" (addr) /* output (scratch address register) */ \
    : /* no inputs */ \
    : "cc", "r0" /* clobbered */ \
  ); \
}
|
242 |
||
243 |
//-------------------------------------------------------------- |
|
244 |
// Atomic copying of 4-byte entities. |
|
245 |
// Exactly 4 (four) entities are copied. |
|
246 |
// Conjoint/disjoint property does not matter. Entities are first |
|
247 |
// loaded and then stored. |
|
248 |
// _to and _from must be 4-byte aligned. |
|
249 |
//-------------------------------------------------------------- |
|
250 |
// Copy four 4-byte entities: load all of them into r0..r3 (LM), then store
// them (STM). r3 first holds the source address and is overwritten by the LM
// itself. _to and _from are pointer variables passed as memory operands.
#define COPY4_ATOMIC_4(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG 3,%[from] \n\t" /* address of from area */ \
    "LG %[toaddr],%[to] \n\t" /* address of to area */ \
    "LM 0,3,0(3) \n\t" /* load data into r0..r3 */ \
    "STM 0,3,0(%[toaddr]) \n\t" /* store data from r0..r3 */ \
    : [to] "+Q" (_to) /* outputs */ \
    , [from] "+Q" (_from) /* outputs */ \
    , [toaddr] "=a" (toaddr) /* output (scratch address register) */ \
    : /* no inputs */ \
    : "cc", "r0", "r1", "r2", "r3" /* clobbered */ \
  ); \
}
|
264 |
// Copy three 4-byte entities: load all of them into r0..r2 (LM), then store
// them (STM). r2 first holds the source address and is overwritten by the LM
// itself. _to and _from are pointer variables passed as memory operands.
#define COPY4_ATOMIC_3(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG 2,%[from] \n\t" /* address of from area */ \
    "LG %[toaddr],%[to] \n\t" /* address of to area */ \
    "LM 0,2,0(2) \n\t" /* load data into r0..r2 */ \
    "STM 0,2,0(%[toaddr]) \n\t" /* store data from r0..r2 */ \
    : [to] "+Q" (_to) /* outputs */ \
    , [from] "+Q" (_from) /* outputs */ \
    , [toaddr] "=a" (toaddr) /* output (scratch address register) */ \
    : /* no inputs */ \
    : "cc", "r0", "r1", "r2" /* clobbered */ \
  ); \
}
|
278 |
// Copy two 4-byte entities: load both into r0..r1 (LM), then store them (STM).
// r1 first holds the source address and is overwritten by the LM itself.
// _to and _from are pointer variables passed as memory operands.
#define COPY4_ATOMIC_2(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG 1,%[from] \n\t" /* address of from area */ \
    "LG %[toaddr],%[to] \n\t" /* address of to area */ \
    "LM 0,1,0(1) \n\t" /* load data into r0..r1 */ \
    "STM 0,1,0(%[toaddr]) \n\t" /* store data from r0..r1 */ \
    : [to] "+Q" (_to) /* outputs */ \
    , [from] "+Q" (_from) /* outputs */ \
    , [toaddr] "=a" (toaddr) /* output (scratch address register) */ \
    : /* no inputs */ \
    : "cc", "r0", "r1" /* clobbered */ \
  ); \
}
|
292 |
// Copy one 4-byte entity: load it into r0 (L), then store it (ST).
// The same scratch register 'addr' holds first the source and then the
// destination address. _to and _from are pointer variables passed as memory
// operands.
#define COPY4_ATOMIC_1(_to,_from) { \
  unsigned long addr; \
  asm( \
    "LG %[addr],%[from] \n\t" /* address of from area */ \
    "L 0,0(0,%[addr]) \n\t" /* load data into r0 */ \
    "LG %[addr],%[to] \n\t" /* address of to area */ \
    "ST 0,0(0,%[addr]) \n\t" /* store data from r0 */ \
    : [to] "+Q" (_to) /* outputs */ \
    , [from] "+Q" (_from) /* outputs */ \
    , [addr] "=a" (addr) /* output (scratch address register) */ \
    : /* no inputs */ \
    : "cc", "r0" /* clobbered */ \
  ); \
}
|
306 |
||
307 |
// MVC_MEMCOPY(_to,_from,_len): copy _len bytes by EXecuting an MVC template
// whose length field is supplied from a register at run time.
// _to and _from are pointer variables passed as memory operands.
// NOTE(review): the MVC template encodes a one-byte length and EX supplies
// only the low-order byte of the register, so this presumably handles
// _len <= 256 only -- confirm against the callers.
#if 0 // Waiting for gcc to support EXRL.
// Disabled variant: uses EXRL where the hardware provides it, EX otherwise.
#define MVC_MEMCOPY(_to,_from,_len) \
  if (VM_Version::has_ExecuteExtensions()) { \
    asm("\t" \
      " LAY 1,-1(0,%[len]) \n\t" /* decr for MVC */ \
      " EXRL 1,1f \n\t" /* execute MVC instr */ \
      " BRC 15,2f \n\t" /* skip template */ \
      "1: MVC 0(%[len],%[to]),0(%[from]) \n\t" \
      "2: BCR 0,0 \n\t" \
      : [to] "+Q" (_to) /* outputs */ \
      , [from] "+Q" (_from) /* outputs */ \
      : [len] "r" (_len) /* inputs */ \
      : "cc", "r1" /* clobbered */ \
    ); \
  } else { \
    asm("\t" \
      " LARL 2,3f \n\t" \
      " LAY 1,-1(0,%[len]) \n\t" /* decr for MVC */ \
      " EX 1,0(2) \n\t" /* execute MVC instr */ \
      " BRC 15,4f \n\t" /* skip template */ \
      "3: MVC 0(%[len],%[to]),0(%[from]) \n\t" \
      "4: BCR 0,0 \n\t" \
      : [to] "+Q" (_to) /* outputs */ \
      , [from] "+Q" (_from) /* outputs */ \
      : [len] "r" (_len) /* inputs */ \
      : "cc", "r1", "r2" /* clobbered */ \
    ); \
  }
#else
// Active variant: EX only. Does nothing for _len == 0.
#define MVC_MEMCOPY(_to,_from,_len) \
  { unsigned long toaddr; unsigned long tolen; \
    unsigned long fromaddr; unsigned long target; \
    asm("\t" \
      " LTGR %[tolen],%[len] \n\t" /* load and test len */ \
      " BRC 8,2f \n\t" /* do nothing for l=0*/ \
      " AGHI %[tolen],-1 \n\t" /* decr for MVC */ \
      " LG %[toaddr],%[to] \n\t" /* address of to area */ \
      " LG %[fromaddr],%[from] \n\t" /* address of from area */ \
      " LARL %[target],1f \n\t" /* addr of MVC instr */ \
      " EX %[tolen],0(%[target]) \n\t" /* execute MVC instr */ \
      " BRC 15,2f \n\t" /* skip template */ \
      "1: MVC 0(1,%[toaddr]),0(%[fromaddr]) \n\t" /* MVC template, only reached via EX */ \
      "2: BCR 0,0 \n\t" /* nop a branch target*/\
      : [to] "+Q" (_to) /* outputs */ \
      , [from] "+Q" (_from) \
      , [tolen] "=a" (tolen) \
      , [toaddr] "=a" (toaddr) \
      , [fromaddr] "=a" (fromaddr) \
      , [target] "=a" (target) \
      : [len] "r" (_len) /* inputs */ \
      : "cc" /* clobbered */ \
    ); \
  }
#endif
|
361 |
||
362 |
#if 0 // code snippet to be used for debugging |
|
363 |
/* ASSERT code BEGIN */ \ |
|
364 |
" LARL %[len],5f \n\t" \ |
|
365 |
" LARL %[mta],4f \n\t" \ |
|
366 |
" SLGR %[len],%[mta] \n\t" \ |
|
367 |
" CGHI %[len],16 \n\t" \ |
|
368 |
" BRC 7,9f \n\t" /* block size != 16 */ \ |
|
369 |
\ |
|
370 |
" LARL %[len],1f \n\t" \ |
|
371 |
" SLGR %[len],%[mta] \n\t" \ |
|
372 |
" CGHI %[len],256 \n\t" \ |
|
373 |
" BRC 7,9f \n\t" /* list len != 256 */ \ |
|
374 |
\ |
|
375 |
" LGR 0,0 \n\t" /* artificial SIGILL */ \ |
|
376 |
"9: BRC 7,-2 \n\t" \ |
|
377 |
" LARL %[mta],1f \n\t" /* restore MVC table begin */ \ |
|
378 |
/* ASSERT code END */ |
|
379 |
#endif |
|
380 |
||
381 |
// Optimized copying for data less than 4k |
|
382 |
// - no destructive overlap |
|
383 |
// - 0 <= _n_bytes <= 4096 |
|
384 |
// This macro needs to be gcc-compiled with -march=z990. Otherwise, the |
|
385 |
// LAY instruction is not available. |
|
386 |
// MVC_MULTI(_to,_from,_n_bytes): copy _n_bytes bytes in two steps.
//  1) The remainder (_n_bytes mod 256) is copied by EXecuting the MVC template
//     at label 2; both addresses are then advanced past that piece.
//  2) The (_n_bytes / 256) full 256-byte chunks are copied by branching into a
//     table of 16 unrolled {MVC, LAY, LA} groups. The entry point is computed
//     backwards from the table end (label 1) in steps of 16 bytes.
// _to and _from are pointer variables passed as memory operands. _n_bytes is
// an in/out operand ("+a") and is modified by the code.
// Labels 4 and 5 are not branched to by this code; the disabled debugging
// snippet in this file refers to them.
#define MVC_MULTI(_to,_from,_n_bytes) \
  { unsigned long toaddr; \
    unsigned long fromaddr; \
    unsigned long movetable; \
    unsigned long len; \
    asm("\t" \
      " LTGFR %[len],%[nby] \n\t" /* len = n_bytes, set cc */ \
      " LG %[ta],%[to] \n\t" /* address of to area */ \
      " BRC 8,1f \n\t" /* nothing to copy */ \
      \
      " NILL %[nby],255 \n\t" /* # bytes mod 256 */ \
      " LG %[fa],%[from] \n\t" /* address of from area */ \
      " BRC 8,3f \n\t" /* no rest, skip copying */ \
      \
      " LARL %[mta],2f \n\t" /* MVC template addr */ \
      " AHI %[nby],-1 \n\t" /* adjust for EX MVC */ \
      \
      " EX %[nby],0(%[mta]) \n\t" /* only rightmost */ \
      /* 8 bits of nby used */ \
      /* Since nby is <= 4096 on entry to this code, we do need */ \
      /* no zero extension before using it in addr calc. */ \
      " LA %[fa],1(%[nby],%[fa]) \n\t"/* adjust from addr */ \
      " LA %[ta],1(%[nby],%[ta]) \n\t"/* adjust to addr */ \
      \
      "3: SRAG %[nby],%[len],8 \n\t" /* # cache lines */ \
      " LARL %[mta],1f \n\t" /* MVC table begin */ \
      " BRC 8,1f \n\t" /* nothing to copy */ \
      \
      /* Insert ASSERT code here if required. */ \
      \
      \
      " LNGFR %[nby],%[nby] \n\t" /* negative offset into */ \
      " SLLG %[nby],%[nby],4 \n\t" /* MVC table 16-byte blocks */ \
      " BC 15,0(%[nby],%[mta]) \n\t" /* branch to block #ncl */ \
      \
      "2: MVC 0(1,%[ta]),0(%[fa]) \n\t" /* MVC template */ \
      \
      "4: MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 4096 == l */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      "5: MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 3840 <= l < 4096 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      " MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 3584 <= l < 3840 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      " MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 3328 <= l < 3584 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      " MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 3072 <= l < 3328 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      " MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 2816 <= l < 3072 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      " MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 2560 <= l < 2816 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      " MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 2304 <= l < 2560 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      " MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 2048 <= l < 2304 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      " MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 1792 <= l < 2048 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      " MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 1536 <= l < 1792 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      " MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 1280 <= l < 1536 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      " MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 1024 <= l < 1280 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      " MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 768 <= l < 1024 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      " MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 512 <= l < 768 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      " MVC 0(256,%[ta]),0(%[fa]) \n\t" /* 256 <= l < 512 */ \
      " LAY %[ta],256(0,%[ta]) \n\t" \
      " LA %[fa],256(0,%[fa]) \n\t" \
      "1: BCR 0,0 \n\t" /* nop as branch target */ \
      : [to] "+Q" (_to) /* outputs */ \
      , [from] "+Q" (_from) \
      , [ta] "=a" (toaddr) \
      , [fa] "=a" (fromaddr) \
      , [mta] "=a" (movetable) \
      , [nby] "+a" (_n_bytes) \
      , [len] "=a" (len) \
      : /* no inputs */ \
      : "cc" /* clobbered */ \
    ); \
  }
|
483 |
||
484 |
// MVCLE_MEMCOPY(_to,_from,_len): copy _len bytes with MVCLE.
// Register pair r0/r1 holds destination address/length, pair r2/r3 holds
// source address/length. The MVCLE is re-issued as long as BRC 1 finds
// condition code 3. _to and _from are pointer variables passed as memory
// operands. The expansion is a complete statement including the trailing ';'.
#define MVCLE_MEMCOPY(_to,_from,_len) \
  asm( \
    " LG 0,%[to] \n\t" /* address of to area */ \
    " LG 2,%[from] \n\t" /* address of from area */ \
    " LGR 1,%[len] \n\t" /* len of to area */ \
    " LGR 3,%[len] \n\t" /* len of from area */ \
    "1: MVCLE 0,2,176 \n\t" /* copy storage, bypass cache (0xb0) */ \
    " BRC 1,1b \n\t" /* retry if interrupted */ \
    : [to] "+Q" (_to) /* outputs */ \
    , [from] "+Q" (_from) /* outputs */ \
    : [len] "r" (_len) /* inputs */ \
    : "cc", "r0", "r1", "r2", "r3" /* clobbered */ \
  );
|
497 |
||
498 |
// MVCLE_MEMINIT(_to,_val,_len): fill _len bytes at the address held in _to,
// using MVCLE with a source length of zero (r3 = 0) so that only the padding
// byte, taken from 0(%[val]), is stored. The MVCLE is re-issued as long as
// BRC 1 finds condition code 3. The expansion is a complete statement
// including the trailing ';'.
// NOTE(review): the MVCLE names r2 as its second operand although r2 is
// neither initialized nor listed as clobbered here -- confirm this is safe.
// NOTE(review): %[val] has constraint "r" but is used as a base register;
// confirm it can never be allocated to r0.
#define MVCLE_MEMINIT(_to,_val,_len) \
  asm( \
    " LG 0,%[to] \n\t" /* address of to area */ \
    " LGR 1,%[len] \n\t" /* len of to area */ \
    " XGR 3,3 \n\t" /* from area len = 0 */ \
    "1: MVCLE 0,2,0(%[val]) \n\t" /* init storage */ \
    " BRC 1,1b \n\t" /* retry if interrupted */ \
    : [to] "+Q" (_to) /* outputs */ \
    : [len] "r" (_len) /* inputs */ \
    , [val] "r" (_val) /* inputs */ \
    : "cc", "r0", "r1", "r3" /* clobbered */ \
  );
|
510 |
// MVCLE_MEMZERO(_to,_len): clear _len bytes at the address held in _to,
// using MVCLE with a source length of zero (r3 = 0) and padding byte 0.
// The MVCLE is re-issued as long as BRC 1 finds condition code 3.
// The expansion is a complete statement including the trailing ';'.
// NOTE(review): the MVCLE names r2 as its second operand although r2 is
// neither initialized nor listed as clobbered here -- confirm this is safe.
#define MVCLE_MEMZERO(_to,_len) \
  asm( \
    " LG 0,%[to] \n\t" /* address of to area */ \
    " LGR 1,%[len] \n\t" /* len of to area */ \
    " XGR 3,3 \n\t" /* from area len = 0 */ \
    "1: MVCLE 0,2,0 \n\t" /* clear storage */ \
    " BRC 1,1b \n\t" /* retry if interrupted */ \
    : [to] "+Q" (_to) /* outputs */ \
    : [len] "r" (_len) /* inputs */ \
    : "cc", "r0", "r1", "r3" /* clobbered */ \
  );
|
521 |
||
522 |
// Clear a stretch of memory, 0 <= _len <= 256. |
|
523 |
// There is no alignment prereq. |
|
524 |
// There is no test for len out of range specified above. |
|
525 |
// XC_MEMZERO_256(_to,_len): clear _len bytes at the address held in _to by
// EXecuting an XC template (area XORed with itself) whose length is supplied
// from a register. Does nothing for _len == 0.
#define XC_MEMZERO_256(_to,_len) \
  { unsigned long toaddr; unsigned long tolen; \
    unsigned long target; \
    asm("\t" \
      " LTGR %[tolen],%[len] \n\t" /* load and test len */ \
      " BRC 8,2f \n\t" /* do nothing for l=0*/ \
      " AGHI %[tolen],-1 \n\t" /* adjust for EX XC */ \
      " LARL %[target],1f \n\t" /* addr of XC instr */ \
      " LG %[toaddr],%[to] \n\t" /* addr of data area */ \
      " EX %[tolen],0(%[target]) \n\t" /* execute XC instr */ \
      " BRC 15,2f \n\t" /* skip template */ \
      "1: XC 0(1,%[toaddr]),0(%[toaddr]) \n\t" /* XC template, only reached via EX */ \
      "2: BCR 0,0 \n\t" /* nop a branch target*/\
      : [to] "+Q" (_to) /* outputs */ \
      , [tolen] "=a" (tolen) \
      , [toaddr] "=a" (toaddr) \
      , [target] "=a" (target) \
      : [len] "r" (_len) /* inputs */ \
      : "cc" /* clobbered */ \
    ); \
  }
|
546 |
||
547 |
// Clear a stretch of memory, 256 < _len. |
|
548 |
// XC_MEMZERO_256 may be used to clear shorter areas. |
|
549 |
// |
|
550 |
// The code |
|
551 |
// - first zeroes a few bytes to align on a HeapWord. |
|
552 |
// This step is currently inactive because all calls seem |
|
553 |
// to have their data aligned on HeapWord boundaries. |
|
554 |
// - then zeroes a few HeapWords to align on a cache line. |
|
555 |
// - then zeroes entire cache lines in a loop. |
|
556 |
// - then zeroes the remaining (partial) cache line. |
|
557 |
// XC_MEMZERO_ANY(_to,_len): clear _len bytes at the address held in _to.
// Structure of the asm code:
//   - leading piece up to the next 256-byte boundary, via EX of the XC template
//   - loop over full 256-byte chunks (label 5)
//   - trailing piece (_len mod 256), again via EX of the XC template
// The XC template at label 1 is only ever reached through EX.
#if 1
// Active variant: no separate step to align on a HeapWord boundary.
// The local 'len8' is declared but not used by this variant.
#define XC_MEMZERO_ANY(_to,_len) \
  { unsigned long toaddr; unsigned long tolen; \
    unsigned long len8; unsigned long len256; \
    unsigned long target; unsigned long lenx; \
    asm("\t" \
      " LTGR %[tolen],%[len] \n\t" /* load and test len */ \
      " BRC 8,2f \n\t" /* do nothing for l=0*/ \
      " LG %[toaddr],%[to] \n\t" /* addr of data area */ \
      " LARL %[target],1f \n\t" /* addr of XC instr */ \
      " " \
      " LCGR %[len256],%[toaddr] \n\t" /* cache line alignment */\
      " NILL %[len256],0xff \n\t" \
      " BRC 8,4f \n\t" /* already aligned */ \
      " NILH %[len256],0x00 \n\t" /* zero extend */ \
      " LLGFR %[len256],%[len256] \n\t" \
      " LAY %[lenx],-1(,%[len256]) \n\t" /* adjust for EX XC */ \
      " EX %[lenx],0(%[target]) \n\t" /* execute XC instr */ \
      " LA %[toaddr],0(%[len256],%[toaddr]) \n\t" /* advance addr */ \
      " SGR %[tolen],%[len256] \n\t" /* adjust len */ \
      " " \
      "4: SRAG %[lenx],%[tolen],8 \n\t" /* # cache lines */ \
      " BRC 8,6f \n\t" /* no full cache lines */ \
      "5: XC 0(256,%[toaddr]),0(%[toaddr]) \n\t" /* clear 256 bytes */ \
      " LA %[toaddr],256(,%[toaddr]) \n\t" \
      " BRCTG %[lenx],5b \n\t" /* iterate */ \
      " " \
      "6: NILL %[tolen],0xff \n\t" /* leftover bytes */ \
      " BRC 8,2f \n\t" /* done if none */ \
      " LAY %[lenx],-1(,%[tolen]) \n\t" /* adjust for EX XC */ \
      " EX %[lenx],0(%[target]) \n\t" /* execute XC instr */ \
      " BRC 15,2f \n\t" /* skip template */ \
      " " \
      "1: XC 0(1,%[toaddr]),0(%[toaddr]) \n\t" /* XC template */ \
      "2: BCR 0,0 \n\t" /* nop a branch target */ \
      : [to] "+Q" (_to) /* outputs */ \
      , [lenx] "=a" (lenx) \
      , [len256] "=a" (len256) \
      , [tolen] "=a" (tolen) \
      , [toaddr] "=a" (toaddr) \
      , [target] "=a" (target) \
      : [len] "r" (_len) /* inputs */ \
      : "cc" /* clobbered */ \
    ); \
  }
#else
// Inactive variant: additionally clears up to the next 8-byte boundary first.
#define XC_MEMZERO_ANY(_to,_len) \
  { unsigned long toaddr; unsigned long tolen; \
    unsigned long len8; unsigned long len256; \
    unsigned long target; unsigned long lenx; \
    asm("\t" \
      " LTGR %[tolen],%[len] \n\t" /* load and test len */ \
      " BRC 8,2f \n\t" /* do nothing for l=0*/ \
      " LG %[toaddr],%[to] \n\t" /* addr of data area */ \
      " LARL %[target],1f \n\t" /* addr of XC instr */ \
      " " \
      " LCGR %[len8],%[toaddr] \n\t" /* HeapWord alignment */ \
      " NILL %[len8],0x07 \n\t" \
      " BRC 8,3f \n\t" /* already aligned */ \
      " NILH %[len8],0x00 \n\t" /* zero extend */ \
      " LLGFR %[len8],%[len8] \n\t" \
      " LAY %[lenx],-1(,%[len8]) \n\t" /* adjust for EX XC */ \
      " EX %[lenx],0(%[target]) \n\t" /* execute XC instr */ \
      " LA %[toaddr],0(%[len8],%[toaddr]) \n\t" /* advance addr */ \
      " SGR %[tolen],%[len8] \n\t" /* adjust len */ \
      " " \
      "3: LCGR %[len256],%[toaddr] \n\t" /* cache line alignment */\
      " NILL %[len256],0xff \n\t" \
      " BRC 8,4f \n\t" /* already aligned */ \
      " NILH %[len256],0x00 \n\t" /* zero extend */ \
      " LLGFR %[len256],%[len256] \n\t" \
      " LAY %[lenx],-1(,%[len256]) \n\t" /* adjust for EX XC */ \
      " EX %[lenx],0(%[target]) \n\t" /* execute XC instr */ \
      " LA %[toaddr],0(%[len256],%[toaddr]) \n\t" /* advance addr */ \
      " SGR %[tolen],%[len256] \n\t" /* adjust len */ \
      " " \
      "4: SRAG %[lenx],%[tolen],8 \n\t" /* # cache lines */ \
      " BRC 8,6f \n\t" /* no full cache lines */ \
      "5: XC 0(256,%[toaddr]),0(%[toaddr]) \n\t" /* clear 256 bytes */ \
      " LA %[toaddr],256(,%[toaddr]) \n\t" \
      " BRCTG %[lenx],5b \n\t" /* iterate */ \
      " " \
      "6: NILL %[tolen],0xff \n\t" /* leftover bytes */ \
      " BRC 8,2f \n\t" /* done if none */ \
      " LAY %[lenx],-1(,%[tolen]) \n\t" /* adjust for EX XC */ \
      " EX %[lenx],0(%[target]) \n\t" /* execute XC instr */ \
      " BRC 15,2f \n\t" /* skip template */ \
      " " \
      "1: XC 0(1,%[toaddr]),0(%[toaddr]) \n\t" /* XC template */ \
      "2: BCR 0,0 \n\t" /* nop a branch target */ \
      : [to] "+Q" (_to) /* outputs */ \
      , [lenx] "=a" (lenx) \
      , [len8] "=a" (len8) \
      , [len256] "=a" (len256) \
      , [tolen] "=a" (tolen) \
      , [toaddr] "=a" (toaddr) \
      , [target] "=a" (target) \
      : [len] "r" (_len) /* inputs */ \
      : "cc" /* clobbered */ \
    ); \
  }
#endif
|
659 |
#endif // USE_INLINE_ASM |
|
660 |
||
661 |
//*************************************// |
|
662 |
// D I S J O I N T C O P Y I N G // |
|
663 |
//*************************************// |
|
664 |
||
48951 | 665 |
// Copy 'count' HeapWords from 'from' to 'to'. The areas must not overlap.
// JVM2008: very frequent, some tests frequent.
//
// With USE_INLINE_ASM, the work is dispatched by size:
//   count == 1, 2        dedicated MOVE8_ATOMIC_<n> macro
//   up to 4096 bytes     MVC_MULTI
//   anything larger      MVCLE_MEMCOPY
// MVCLE guarantees doubleword-concurrent (i.e. atomic) accesses if both operand
// addresses are doubleword aligned and the length is an integer multiple of a
// doubleword, which should always hold for HeapWord ranges. The hardware
// detects page-aligned, page-sized pieces by itself; no special exploitation
// is coded here.
//
// Without USE_INLINE_ASM, plain HeapWord assignments are used.
static void pd_aligned_disjoint_words(const HeapWord* from, HeapWord* to, size_t count) {
#ifdef USE_INLINE_ASM
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count*HeapWordSize;

  if (count == 0) {
    return;
  }
  if (count == 1) {
    MOVE8_ATOMIC_1(to,from)
    return;
  }
  if (count == 2) {
    MOVE8_ATOMIC_2(to,from)
    return;
  }
  // Counts 3 and 4 have no dedicated path; they are handled like any other
  // short copy.
  if (len_bytes <= 4096) {
    // Optimized copying for data of at most 4k.
    MVC_MULTI(to,from,len_bytes)
    return;
  }
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  return;
#else
  // Fallback code: short copies are unrolled, longer ones use a simple loop.
  switch (count) {
    case 0:
      return;

    case 1:
      to[0] = from[0];
      return;

    case 2:
      to[0] = from[0];
      to[1] = from[1];
      return;

    case 3:
      to[0] = from[0];
      to[1] = from[1];
      to[2] = from[2];
      return;

    case 4:
      to[0] = from[0];
      to[1] = from[1];
      to[2] = from[2];
      to[3] = from[3];
      return;

    default:
      for (size_t idx = 0; idx < count; idx++) {
        to[idx] = from[idx];
      }
      return;
  }
#endif
}
|
742 |
||
48956 | 743 |
// Word copy, atomic flavour: asserts that both addresses have their low
// three bits clear (8-byte alignment) and then forwards all arguments,
// unchanged, to pd_aligned_disjoint_words().
// JVM2008: < 4k calls.
static void pd_disjoint_words_atomic(const HeapWord* from, HeapWord* to, size_t count) {
  assert(((((size_t)from) & 0x07L) | (((size_t)to) & 0x07L)) == 0, "No atomic copy w/o aligned data");

  // Rare calls -> just delegate.
  pd_aligned_disjoint_words(from, to, count);
}
|
748 |
||
48956 | 749 |
// Word copy without any extra precondition check: forwards all arguments,
// unchanged, to pd_aligned_disjoint_words().
// JVM2008: very rare.
static void pd_disjoint_words(const HeapWord* from, HeapWord* to, size_t count) {
  // Rare calls -> just delegate.
  pd_aligned_disjoint_words(from, to, count);
}
|
753 |
||
754 |
||
755 |
//*************************************// |
|
756 |
// C O N J O I N T C O P Y I N G // |
|
757 |
//*************************************// |
|
758 |
||
48951 | 759 |
static void pd_aligned_conjoint_words(const HeapWord* from, HeapWord* to, size_t count) { |
42065 | 760 |
// JVM2008: between some and lower end of frequent. |
761 |
||
762 |
#ifdef USE_INLINE_ASM |
|
763 |
size_t count_in = count; |
|
764 |
if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) { |
|
765 |
switch (count_in) { |
|
766 |
case 4: COPY8_ATOMIC_4(to,from) |
|
767 |
return; |
|
768 |
case 3: COPY8_ATOMIC_3(to,from) |
|
769 |
return; |
|
770 |
case 2: COPY8_ATOMIC_2(to,from) |
|
771 |
return; |
|
772 |
case 1: COPY8_ATOMIC_1(to,from) |
|
773 |
return; |
|
774 |
case 0: return; |
|
775 |
default: |
|
776 |
from += count_in; |
|
777 |
to += count_in; |
|
778 |
while (count_in-- > 0) |
|
779 |
*(--to) = *(--from); // Copy backwards, areas overlap destructively. |
|
780 |
return; |
|
781 |
} |
|
782 |
} |
|
783 |
// else |
|
784 |
jbyte* to_bytes = (jbyte*)to; |
|
785 |
jbyte* from_bytes = (jbyte*)from; |
|
786 |
size_t len_bytes = count_in*BytesPerLong; |
|
787 |
MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes) |
|
788 |
return; |
|
789 |
#else |
|
790 |
// Fallback code. |
|
791 |
if (has_destructive_overlap((char*)from, (char*)to, count*BytesPerLong)) { |
|
792 |
HeapWord t1, t2, t3; |
|
793 |
switch (count) { |
|
794 |
case 0: |
|
795 |
return; |
|
796 |
||
797 |
case 1: |
|
798 |
*to = *from; |
|
799 |
return; |
|
800 |
||
801 |
case 2: |
|
802 |
t1 = *(from+1); |
|
803 |
*to = *from; |
|
804 |
*(to+1) = t1; |
|
805 |
return; |
|
806 |
||
807 |
case 3: |
|
808 |
t1 = *(from+1); |
|
809 |
t2 = *(from+2); |
|
810 |
*to = *from; |
|
811 |
*(to+1) = t1; |
|
812 |
*(to+2) = t2; |
|
813 |
return; |
|
814 |
||
815 |
case 4: |
|
816 |
t1 = *(from+1); |
|
817 |
t2 = *(from+2); |
|
818 |
t3 = *(from+3); |
|
819 |
*to = *from; |
|
820 |
*(to+1) = t1; |
|
821 |
*(to+2) = t2; |
|
822 |
*(to+3) = t3; |
|
823 |
return; |
|
824 |
||
825 |
default: |
|
826 |
from += count; |
|
827 |
to += count; |
|
828 |
while (count-- > 0) |
|
829 |
*(--to) = *(--from); // Copy backwards, areas overlap destructively. |
|
830 |
return; |
|
831 |
} |
|
832 |
} |
|
833 |
// else |
|
834 |
// Just delegate. HeapWords are optimally aligned anyway. |
|
835 |
pd_aligned_disjoint_words(from, to, count); |
|
836 |
#endif |
|
837 |
} |
|
838 |
||
48951 | 839 |
// Word copy for possibly overlapping ranges. Forwards all arguments,
// unchanged, to pd_aligned_conjoint_words().
static void pd_conjoint_words(const HeapWord* from, HeapWord* to, size_t count) {
  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_conjoint_words(from, to, count);
}
|
844 |
||
48951 | 845 |
static void pd_conjoint_bytes(const void* from, void* to, size_t count) { |
42065 | 846 |
|
847 |
#ifdef USE_INLINE_ASM |
|
848 |
size_t count_in = count; |
|
849 |
if (has_destructive_overlap((char*)from, (char*)to, count_in)) |
|
850 |
(void)memmove(to, from, count_in); |
|
851 |
else { |
|
852 |
jbyte* to_bytes = (jbyte*)to; |
|
853 |
jbyte* from_bytes = (jbyte*)from; |
|
854 |
size_t len_bytes = count_in; |
|
855 |
MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes) |
|
856 |
} |
|
857 |
#else |
|
858 |
if (has_destructive_overlap((char*)from, (char*)to, count)) |
|
859 |
(void)memmove(to, from, count); |
|
860 |
else |
|
861 |
(void)memcpy(to, from, count); |
|
862 |
#endif |
|
863 |
} |
|
864 |
||
865 |
//**************************************************// |
|
866 |
// C O N J O I N T A T O M I C C O P Y I N G // |
|
867 |
//**************************************************// |
|
868 |
||
48951 | 869 |
// Atomic byte copy for possibly overlapping ranges. Forwards all arguments,
// unchanged, to pd_conjoint_bytes().
static void pd_conjoint_bytes_atomic(const void* from, void* to, size_t count) {
  // Bytes are always accessed atomically, so the plain byte copy suffices.
  pd_conjoint_bytes(from, to, count);
}
|
873 |
||
48951 | 874 |
// Copy 'count' jshorts from 'from' to 'to'; the ranges may overlap.
// With USE_INLINE_ASM, ranges without destructive overlap are copied by
// MVCLE_MEMCOPY. Every other case is handed to copy_conjoint_jshorts_atomic().
static void pd_conjoint_jshorts_atomic(const jshort* from, jshort* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (!has_destructive_overlap((const char*)from, (char*)to, count_in*BytesPerShort)) {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in*BytesPerShort;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
    return;
  }
#endif

  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jshorts_atomic(from, to, count);
}
|
892 |
||
48951 | 893 |
// Copy 'count' jints from 'from' to 'to'; the ranges may overlap.
// With USE_INLINE_ASM:
//   - no destructive overlap: one MVCLE_MEMCOPY over the byte range;
//   - destructive overlap, 1..4 elements: COPY4_ATOMIC_<n> macros;
//   - destructive overlap, more elements: copy_conjoint_jints_atomic().
// Without USE_INLINE_ASM everything goes to copy_conjoint_jints_atomic().
static void pd_conjoint_jints_atomic(const jint* from, jint* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;

  if (!has_destructive_overlap((const char*)from, (char*)to, count_in*BytesPerInt)) {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in*BytesPerInt;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
    return;
  }

  switch (count_in) {
    case 0:
      return;
    case 1: COPY4_ATOMIC_1(to,from)
      return;
    case 2: COPY4_ATOMIC_2(to,from)
      return;
    case 3: COPY4_ATOMIC_3(to,from)
      return;
    case 4: COPY4_ATOMIC_4(to,from)
      return;
    default:
      break; // Longer ranges are handled by the shared code below.
  }
#endif

  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jints_atomic(from, to, count);
}
|
924 |
||
48951 | 925 |
static void pd_conjoint_jlongs_atomic(const jlong* from, jlong* to, size_t count) { |
42065 | 926 |
|
927 |
#ifdef USE_INLINE_ASM |
|
928 |
size_t count_in = count; |
|
929 |
if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) { |
|
930 |
switch (count_in) { |
|
931 |
case 4: COPY8_ATOMIC_4(to,from) return; |
|
932 |
case 3: COPY8_ATOMIC_3(to,from) return; |
|
933 |
case 2: COPY8_ATOMIC_2(to,from) return; |
|
934 |
case 1: COPY8_ATOMIC_1(to,from) return; |
|
935 |
case 0: return; |
|
936 |
default: |
|
937 |
from += count_in; |
|
938 |
to += count_in; |
|
939 |
while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively. |
|
940 |
return; |
|
941 |
} |
|
942 |
} |
|
943 |
// else { |
|
944 |
jbyte* to_bytes = (jbyte*)to; |
|
945 |
jbyte* from_bytes = (jbyte*)from; |
|
946 |
size_t len_bytes = count_in*BytesPerLong; |
|
947 |
MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes) |
|
948 |
#else |
|
949 |
size_t count_in = count; |
|
950 |
if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) { |
|
951 |
if (count_in < 8) { |
|
952 |
from += count_in; |
|
953 |
to += count_in; |
|
954 |
while (count_in-- > 0) |
|
955 |
*(--to) = *(--from); // Copy backwards, areas overlap destructively. |
|
956 |
return; |
|
957 |
} |
|
958 |
// else { |
|
959 |
from += count_in-1; |
|
960 |
to += count_in-1; |
|
961 |
if (count_in&0x01) { |
|
962 |
*(to--) = *(from--); |
|
963 |
count_in--; |
|
964 |
} |
|
965 |
for (; count_in>0; count_in-=2) { |
|
966 |
*to = *from; |
|
967 |
*(to-1) = *(from-1); |
|
968 |
to -= 2; |
|
969 |
from -= 2; |
|
970 |
} |
|
971 |
} |
|
972 |
else |
|
48951 | 973 |
pd_aligned_disjoint_words((const HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate. |
42065 | 974 |
#endif |
975 |
} |
|
976 |
||
48951 | 977 |
static void pd_conjoint_oops_atomic(const oop* from, oop* to, size_t count) { |
42065 | 978 |
|
979 |
#ifdef USE_INLINE_ASM |
|
980 |
size_t count_in = count; |
|
981 |
if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) { |
|
982 |
switch (count_in) { |
|
983 |
case 4: COPY8_ATOMIC_4(to,from) return; |
|
984 |
case 3: COPY8_ATOMIC_3(to,from) return; |
|
985 |
case 2: COPY8_ATOMIC_2(to,from) return; |
|
986 |
case 1: COPY8_ATOMIC_1(to,from) return; |
|
987 |
case 0: return; |
|
988 |
default: |
|
989 |
from += count_in; |
|
990 |
to += count_in; |
|
991 |
while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively. |
|
992 |
return; |
|
993 |
} |
|
994 |
} |
|
995 |
// else |
|
996 |
jbyte* to_bytes = (jbyte*)to; |
|
997 |
jbyte* from_bytes = (jbyte*)from; |
|
998 |
size_t len_bytes = count_in*BytesPerOop; |
|
999 |
MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes) |
|
1000 |
#else |
|
1001 |
size_t count_in = count; |
|
1002 |
if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) { |
|
1003 |
from += count_in; |
|
1004 |
to += count_in; |
|
1005 |
while (count_in-- > 0) *(--to) = *(--from); // Copy backwards, areas overlap destructively. |
|
1006 |
return; |
|
1007 |
} |
|
1008 |
// else |
|
1009 |
pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate. |
|
1010 |
return; |
|
1011 |
#endif |
|
1012 |
} |
|
1013 |
||
48951 | 1014 |
// Array-of-bytes conjoint copy. Forwards all arguments, unchanged, to
// pd_conjoint_bytes_atomic().
static void pd_arrayof_conjoint_bytes(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_bytes_atomic(from, to, count);
}
|
1017 |
||
48951 | 1018 |
// Array-of-jshorts conjoint copy. Reinterprets both HeapWord pointers as
// jshort pointers and forwards to pd_conjoint_jshorts_atomic().
static void pd_arrayof_conjoint_jshorts(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jshorts_atomic((const jshort*)from, (jshort*)to, count);
}
1021 |
||
48951 | 1022 |
// Array-of-jints conjoint copy. Reinterprets both HeapWord pointers as
// jint pointers and forwards to pd_conjoint_jints_atomic().
static void pd_arrayof_conjoint_jints(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jints_atomic((const jint*)from, (jint*)to, count);
}
1025 |
||
48951 | 1026 |
// Array-of-jlongs conjoint copy. Reinterprets both HeapWord pointers as
// jlong pointers and forwards to pd_conjoint_jlongs_atomic().
static void pd_arrayof_conjoint_jlongs(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jlongs_atomic((const jlong*)from, (jlong*)to, count);
}
1029 |
||
48951 | 1030 |
// Array-of-oops conjoint copy. Reinterprets both HeapWord pointers as
// oop pointers and forwards to pd_conjoint_oops_atomic().
static void pd_arrayof_conjoint_oops(const HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_oops_atomic((const oop*)from, (oop*)to, count);
}
1033 |
||
1034 |
//**********************************************// |
|
1035 |
// M E M O R Y I N I T I A L I S A T I O N // |
|
1036 |
//**********************************************// |
|
1037 |
||
1038 |
// Set 'count' bytes starting at 'to' to 'value'.
// JVM2008: very rare, only in some tests.
static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {
#ifdef USE_INLINE_ASM
  // Initialize storage to a given value without a copy loop.
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  //   1) init short piece of memory to page-align address
  //   2) init largest part (all contained full pages) of memory using mvcle instruction.
  //      z/Architecture processors have special H/W support for page-aligned storage
  //      where len is an int multiple of page size. In that case, up to 4 cache lines are
  //      processed in parallel and L1 cache is not polluted.
  //   3) init the remaining piece of memory.
  // Atomicity cannot really be an issue since gcc implements the loop body with XC anyway.
  // If atomicity is a problem, we have to prevent gcc optimization. Best workaround: inline asm.

  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;

  MVCLE_MEMINIT(to_bytes, value, len_bytes)

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, value, count);
#endif
}
|
1064 |
||
1065 |
// Fill 'count' HeapWords starting at 'tohw' with a 64-bit pattern made of
// the 32-bit 'value' repeated in the upper and lower half.
// Two patterns are special-cased: all-zero goes to pd_zero_to_words(), and
// all-ones goes to pd_fill_to_bytes() with an all-ones fill byte.
// Occurs in dbg builds only. Usually memory poisoning with BAADBABE, DEADBEEF, etc.
// JVM2008: < 4k calls.
static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) {
  if (value == 0) {
    pd_zero_to_words(tohw, count);
  } else if (value == ~(juint)(0)) {
    pd_fill_to_bytes(tohw, count*HeapWordSize, (jubyte)(~(juint)(0)));
  } else {
    julong* to = (julong*) tohw;
    const julong pattern = ((julong) value << 32) | value;
    for (size_t i = 0; i < count; i++) {
      to[i] = pattern;
    }
  }
}
|
1082 |
||
1083 |
// Aligned variant of the word fill. Forwards all arguments, unchanged, to
// pd_fill_to_words().
// JVM2008: very frequent, but virtually all calls are with value == 0.
static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) {
  pd_fill_to_words(tohw, count, value);
}
|
1087 |
||
1088 |
//**********************************// |
|
1089 |
// M E M O R Y C L E A R I N G // |
|
1090 |
//**********************************// |
|
1091 |
||
1092 |
// Delegate to pd_zero_to_bytes. It also works HeapWord-atomic. |
|
1093 |
// Distinguish between simple and large zero_to_words. |
|
1094 |
static void pd_zero_to_words(HeapWord* tohw, size_t count) {
  // Convert the word count into a byte count and let pd_zero_to_bytes()
  // clear the range.
  pd_zero_to_bytes(tohw, count*HeapWordSize);
}
|
1097 |
||
1098 |
// Delegate to pd_zero_to_bytes. It also works HeapWord-atomic. |
|
1099 |
static void pd_zero_to_words_large(HeapWord* tohw, size_t count) {
  // JVM2008: generally frequent, some tests show very frequent calls.
  // Same implementation as pd_zero_to_words(): convert the word count into
  // a byte count and let pd_zero_to_bytes() clear the range.
  pd_zero_to_bytes(tohw, count*HeapWordSize);
}
|
1103 |
||
1104 |
// Clear 'count' bytes starting at 'to'.
// JVM2008: some calls (generally), some tests frequent
static void pd_zero_to_bytes(void* to, size_t count) {
#ifdef USE_INLINE_ASM
  // Even zero_to_bytes() requires HeapWord-atomic, or, at least, sequential
  // zeroing of the memory. MVCLE is not fit for that job:
  //   "As observed by other CPUs and by the channel subsystem,
  //    that portion of the first operand which is filled
  //    with the padding byte is not necessarily stored into in
  //    a left-to-right direction and may appear to be stored
  //    into more than once."
  // Therefore, implementation was changed to use (multiple) XC instructions.

  const long line_size = 256;
  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;

  // Lengths above line_size take the general macro, everything else the
  // short-range one.
  if (len_bytes > line_size) {
    XC_MEMZERO_ANY(to_bytes, len_bytes);
  } else {
    XC_MEMZERO_256(to_bytes, len_bytes);
  }

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, 0, count);
#endif
}
|
1133 |
||
53244
9807daeb47c4
8216167: Update include guards to reflect correct directories
coleenp
parents:
48956
diff
changeset
|
1134 |
#endif // CPU_S390_COPY_S390_HPP |