/*
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2016 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

// Major contributions by LS

#ifndef CPU_S390_VM_COPY_S390_HPP
#define CPU_S390_VM_COPY_S390_HPP

// Inline functions for memory copy and fill.

// HeapWordSize (the size of class HeapWord) is 8 bytes (the size of a
// pointer variable), since we always run the _LP64 model. As a consequence,
// HeapWord* memory ranges are always assumed to be doubleword-aligned,
// having a size which is an integer multiple of HeapWordSize.
//
// Dealing only with doubleword-aligned doubleword units has important
// positive performance and data access consequences. Many of the move
// instructions perform particularly well under these circumstances.
// Data access is "doubleword-concurrent", except for MVC and XC.
// Furthermore, data access can be forced to be sequential (MVCL and MVCLE)
// by use of the special padding byte 0xb1, where required. For copying,
// we use padding byte 0xb0 to prevent the D-cache from being polluted.
//
// On z/Architecture, gcc optimizes memcpy into a series of MVC instructions.
// This is optimal, even if just one HeapWord is copied. However, MVC
// copying is not atomic, i.e. not "doubleword concurrent" by definition.
//
// If the -mmvcle compiler option is specified, memcpy translates into
// code such that the entire memory range is copied or preset with just
// one MVCLE instruction.
//
// *to = *from is transformed into an MVC instruction already with -O1.
// Thus, for atomic copy operations, (inline) assembler code is required
// to guarantee atomic data accesses.
//
// For large (len >= MVCLEThreshold) chunks of memory, we exploit
// special H/W support of z/Architecture:
// 1) copy short piece of memory to page-align address(es)
// 2) copy largest part (all contained full pages) of memory using mvcle instruction.
//    z/Architecture processors have special H/W support for page-aligned storage
//    where len is an integer multiple of page size. In that case, up to 4 cache lines
//    are processed in parallel and L1 cache is not polluted.
// 3) copy the remaining piece of memory.
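//
// For illustration only: the three-step strategy above corresponds roughly to
// the following C-level sketch. It is not compiled (hence the #if 0, as used
// elsewhere in this file for dormant code); mvcle_copy() is a hypothetical
// stand-in for the MVCLE-based copy macro and does not exist in this file.
#if 0
static void large_copy_sketch(char* to, char* from, size_t len) {
  const size_t page = 4096;
  // 1) Copy a short head so that the target address becomes page-aligned.
  size_t head = (page - ((uintptr_t)to & (page - 1))) & (page - 1);
  if (head > len) { head = len; }
  memcpy(to, from, head);
  to += head; from += head; len -= head;
  // 2) Copy all contained full pages at once (the MVCLE path in the real code).
  size_t pages = len & ~(page - 1);
  mvcle_copy(to, from, pages); // hypothetical wrapper, stands in for MVCLE_MEMCOPY
  to += pages; from += pages; len -= pages;
  // 3) Copy the remaining piece of memory.
  memcpy(to, from, len);
}
#endif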
//
// Measurement classifications:
// very rare  - <=     10.000 calls AND <=     1.000 usec elapsed
// rare       - <=    100.000 calls AND <=    10.000 usec elapsed
// some       - <=  1.000.000 calls AND <=   100.000 usec elapsed
// freq       - <= 10.000.000 calls AND <= 1.000.000 usec elapsed
// very freq  -  > 10.000.000 calls OR   > 1.000.000 usec elapsed

#undef USE_INLINE_ASM

static void copy_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      // Copy backwards
      *to-- = *from--;
    }
  }
}

static void copy_conjoint_jints_atomic(jint* from, jint* to, size_t count) {
  if (from > to) {
    while (count-- > 0) {
      // Copy forwards
      *to++ = *from++;
    }
  } else {
    from += count - 1;
    to   += count - 1;
    while (count-- > 0) {
      // Copy backwards
      *to-- = *from--;
    }
  }
}

static bool has_destructive_overlap(char* from, char* to, size_t byte_count) {
  return (from < to) && ((to-from) < (ptrdiff_t)byte_count);
}
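// Worked example for has_destructive_overlap():
//   from = 0x1000, to = 0x1004, byte_count = 8  ->  true (destructive):
//   a forward, element-wise copy would overwrite bytes from[4..7] before
//   reading them. With to <= from, or with the areas at least byte_count
//   apart, a forward copy is safe and the function returns false.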

#ifdef USE_INLINE_ASM

//--------------------------------------------------------------
// Atomic copying. Atomicity is given by the minimum of source
// and target alignment. Refer to mail communication with
// Tim Slegel/IBM. Only usable for disjoint source and target.
//--------------------------------------------------------------
#define MOVE8_ATOMIC_4(_to,_from) { \
  unsigned long toaddr; \
  unsigned long fromaddr; \
  asm( \
    "LG      %[toaddr],%[to]      \n\t" /* address of to area   */ \
    "LG      %[fromaddr],%[from]  \n\t" /* address of from area */ \
    "MVC     0(32,%[toaddr]),0(%[fromaddr]) \n\t" /* move data  */ \
    : [to]       "+Q" (_to)       /* outputs   */ \
    , [from]     "+Q" (_from) \
    , [toaddr]   "=a" (toaddr) \
    , [fromaddr] "=a" (fromaddr) \
    : \
    : "cc"                        /* clobbered */ \
  ); \
}
#define MOVE8_ATOMIC_3(_to,_from) { \
  unsigned long toaddr; \
  unsigned long fromaddr; \
  asm( \
    "LG      %[toaddr],%[to]      \n\t" /* address of to area   */ \
    "LG      %[fromaddr],%[from]  \n\t" /* address of from area */ \
    "MVC     0(24,%[toaddr]),0(%[fromaddr]) \n\t" /* move data  */ \
    : [to]       "+Q" (_to)       /* outputs   */ \
    , [from]     "+Q" (_from) \
    , [toaddr]   "=a" (toaddr) \
    , [fromaddr] "=a" (fromaddr) \
    : \
    : "cc"                        /* clobbered */ \
  ); \
}
#define MOVE8_ATOMIC_2(_to,_from) { \
  unsigned long toaddr; \
  unsigned long fromaddr; \
  asm( \
    "LG      %[toaddr],%[to]      \n\t" /* address of to area   */ \
    "LG      %[fromaddr],%[from]  \n\t" /* address of from area */ \
    "MVC     0(16,%[toaddr]),0(%[fromaddr]) \n\t" /* move data  */ \
    : [to]       "+Q" (_to)       /* outputs   */ \
    , [from]     "+Q" (_from) \
    , [toaddr]   "=a" (toaddr) \
    , [fromaddr] "=a" (fromaddr) \
    : \
    : "cc"                        /* clobbered */ \
  ); \
}
#define MOVE8_ATOMIC_1(_to,_from) { \
  unsigned long toaddr; \
  unsigned long fromaddr; \
  asm( \
    "LG      %[toaddr],%[to]      \n\t" /* address of to area   */ \
    "LG      %[fromaddr],%[from]  \n\t" /* address of from area */ \
    "MVC     0(8,%[toaddr]),0(%[fromaddr])  \n\t" /* move data  */ \
    : [to]       "+Q" (_to)       /* outputs   */ \
    , [from]     "+Q" (_from) \
    , [toaddr]   "=a" (toaddr) \
    , [fromaddr] "=a" (fromaddr) \
    : \
    : "cc"                        /* clobbered */ \
  ); \
}

//--------------------------------------------------------------
// Atomic copying of 8-byte entities.
// Conjoint/disjoint property does not matter. Entities are first
// loaded and then stored.
// _to and _from must be 8-byte aligned.
//--------------------------------------------------------------
#define COPY8_ATOMIC_4(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG      3,%[from]            \n\t" /* address of from area */ \
    "LG      %[toaddr],%[to]      \n\t" /* address of to area   */ \
    "LMG     0,3,0(3)             \n\t" /* load data            */ \
    "STMG    0,3,0(%[toaddr])     \n\t" /* store data           */ \
    : [to]     "+Q" (_to)         /* outputs   */ \
    , [from]   "+Q" (_from) \
    , [toaddr] "=a" (toaddr) \
    : \
    : "cc", "r0", "r1", "r2", "r3" /* clobbered */ \
  ); \
}
#define COPY8_ATOMIC_3(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG      2,%[from]            \n\t" /* address of from area */ \
    "LG      %[toaddr],%[to]      \n\t" /* address of to area   */ \
    "LMG     0,2,0(2)             \n\t" /* load data            */ \
    "STMG    0,2,0(%[toaddr])     \n\t" /* store data           */ \
    : [to]     "+Q" (_to)         /* outputs   */ \
    , [from]   "+Q" (_from) \
    , [toaddr] "=a" (toaddr) \
    : \
    : "cc", "r0", "r1", "r2"      /* clobbered */ \
  ); \
}
#define COPY8_ATOMIC_2(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG      1,%[from]            \n\t" /* address of from area */ \
    "LG      %[toaddr],%[to]      \n\t" /* address of to area   */ \
    "LMG     0,1,0(1)             \n\t" /* load data            */ \
    "STMG    0,1,0(%[toaddr])     \n\t" /* store data           */ \
    : [to]     "+Q" (_to)         /* outputs   */ \
    , [from]   "+Q" (_from) \
    , [toaddr] "=a" (toaddr) \
    : \
    : "cc", "r0", "r1"            /* clobbered */ \
  ); \
}
#define COPY8_ATOMIC_1(_to,_from) { \
  unsigned long addr; \
  asm( \
    "LG      %[addr],%[from]      \n\t" /* address of from area */ \
    "LG      0,0(0,%[addr])       \n\t" /* load data            */ \
    "LG      %[addr],%[to]        \n\t" /* address of to area   */ \
    "STG     0,0(0,%[addr])       \n\t" /* store data           */ \
    : [to]   "+Q" (_to)           /* outputs   */ \
    , [from] "+Q" (_from) \
    , [addr] "=a" (addr) \
    : \
    : "cc", "r0"                  /* clobbered */ \
  ); \
}

//--------------------------------------------------------------
// Atomic copying of 4-byte entities.
// The macro name suffix gives the number of entities copied (1 to 4).
// Conjoint/disjoint property does not matter. Entities are first
// loaded and then stored.
// _to and _from must be 4-byte aligned.
//--------------------------------------------------------------
#define COPY4_ATOMIC_4(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG      3,%[from]            \n\t" /* address of from area */ \
    "LG      %[toaddr],%[to]      \n\t" /* address of to area   */ \
    "LM      0,3,0(3)             \n\t" /* load data            */ \
    "STM     0,3,0(%[toaddr])     \n\t" /* store data           */ \
    : [to]     "+Q" (_to)         /* outputs   */ \
    , [from]   "+Q" (_from) \
    , [toaddr] "=a" (toaddr) \
    : \
    : "cc", "r0", "r1", "r2", "r3" /* clobbered */ \
  ); \
}
#define COPY4_ATOMIC_3(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG      2,%[from]            \n\t" /* address of from area */ \
    "LG      %[toaddr],%[to]      \n\t" /* address of to area   */ \
    "LM      0,2,0(2)             \n\t" /* load data            */ \
    "STM     0,2,0(%[toaddr])     \n\t" /* store data           */ \
    : [to]     "+Q" (_to)         /* outputs   */ \
    , [from]   "+Q" (_from) \
    , [toaddr] "=a" (toaddr) \
    : \
    : "cc", "r0", "r1", "r2"      /* clobbered */ \
  ); \
}
#define COPY4_ATOMIC_2(_to,_from) { \
  unsigned long toaddr; \
  asm( \
    "LG      1,%[from]            \n\t" /* address of from area */ \
    "LG      %[toaddr],%[to]      \n\t" /* address of to area   */ \
    "LM      0,1,0(1)             \n\t" /* load data            */ \
    "STM     0,1,0(%[toaddr])     \n\t" /* store data           */ \
    : [to]     "+Q" (_to)         /* outputs   */ \
    , [from]   "+Q" (_from) \
    , [toaddr] "=a" (toaddr) \
    : \
    : "cc", "r0", "r1"            /* clobbered */ \
  ); \
}
#define COPY4_ATOMIC_1(_to,_from) { \
  unsigned long addr; \
  asm( \
    "LG      %[addr],%[from]      \n\t" /* address of from area */ \
    "L       0,0(0,%[addr])       \n\t" /* load data            */ \
    "LG      %[addr],%[to]        \n\t" /* address of to area   */ \
    "ST      0,0(0,%[addr])       \n\t" /* store data           */ \
    : [to]   "+Q" (_to)           /* outputs   */ \
    , [from] "+Q" (_from) \
    , [addr] "=a" (addr) \
    : \
    : "cc", "r0"                  /* clobbered */ \
  ); \
}

#if 0 // Waiting for gcc to support EXRL.
#define MVC_MEMCOPY(_to,_from,_len) \
  if (VM_Version::has_ExecuteExtensions()) { \
    asm("\t" \
      "   LAY  1,-1(0,%[len])              \n\t" /* decr for MVC      */ \
      "   EXRL 1,1f                        \n\t" /* execute MVC instr */ \
      "   BRC  15,2f                       \n\t" /* skip template     */ \
      "1: MVC  0(%[len],%[to]),0(%[from])  \n\t" \
      "2: BCR  0,0                         \n\t" \
      : [to]   "+Q" (_to)    /* outputs   */ \
      , [from] "+Q" (_from)  /* outputs   */ \
      : [len]  "r"  (_len)   /* inputs    */ \
      : "cc", "r1"           /* clobbered */ \
    ); \
  } else { \
    asm("\t" \
      "   LARL 2,3f                        \n\t" \
      "   LAY  1,-1(0,%[len])              \n\t" /* decr for MVC      */ \
      "   EX   1,0(2)                      \n\t" /* execute MVC instr */ \
      "   BRC  15,4f                       \n\t" /* skip template     */ \
      "3: MVC  0(%[len],%[to]),0(%[from])  \n\t" \
      "4: BCR  0,0                         \n\t" \
      : [to]   "+Q" (_to)    /* outputs   */ \
      , [from] "+Q" (_from)  /* outputs   */ \
      : [len]  "r"  (_len)   /* inputs    */ \
      : "cc", "r1", "r2"     /* clobbered */ \
    ); \
  }
#else
#define MVC_MEMCOPY(_to,_from,_len) \
{ unsigned long toaddr;   unsigned long tolen; \
  unsigned long fromaddr; unsigned long target; \
  asm("\t" \
    "   LTGR %[tolen],%[len]                 \n\t" /* copy and test len  */ \
    "   BRC  8,2f                            \n\t" /* do nothing for l=0 */ \
    "   AGHI %[tolen],-1                     \n\t" \
    "   LG   %[toaddr],%[to]                 \n\t" \
    "   LG   %[fromaddr],%[from]             \n\t" \
    "   LARL %[target],1f                    \n\t" /* addr of MVC instr  */ \
    "   EX   %[tolen],0(%[target])           \n\t" /* execute MVC instr  */ \
    "   BRC  15,2f                           \n\t" /* skip template      */ \
    "1: MVC  0(1,%[toaddr]),0(%[fromaddr])   \n\t" \
    "2: BCR  0,0                             \n\t" /* nop as branch target */ \
    : [to]       "+Q" (_to)    /* outputs   */ \
    , [from]     "+Q" (_from) \
    , [tolen]    "=a" (tolen) \
    , [toaddr]   "=a" (toaddr) \
    , [fromaddr] "=a" (fromaddr) \
    , [target]   "=a" (target) \
    : [len]      "r"  (_len)   /* inputs    */ \
    : "cc"                     /* clobbered */ \
  ); \
}
#endif

#if 0 // code snippet to be used for debugging
      /* ASSERT code BEGIN */ \
      "   LARL %[len],5f     \n\t" \
      "   LARL %[mta],4f     \n\t" \
      "   SLGR %[len],%[mta] \n\t" \
      "   CGHI %[len],16     \n\t" \
      "   BRC  7,9f          \n\t" /* block size != 16 */ \
      \
      "   LARL %[len],1f     \n\t" \
      "   SLGR %[len],%[mta] \n\t" \
      "   CGHI %[len],256    \n\t" \
      "   BRC  7,9f          \n\t" /* list len != 256 */ \
      \
      "   LGR  0,0           \n\t" /* artificial SIGILL */ \
      "9: BRC  7,-2          \n\t" \
      "   LARL %[mta],1f     \n\t" /* restore MVC table begin */ \
      /* ASSERT code END */
#endif

// Optimized copying for data less than 4k
// - no destructive overlap
// - 0 <= _n_bytes <= 4096
// This macro needs to be gcc-compiled with -march=z990. Otherwise, the
// LAY instruction is not available.
#define MVC_MULTI(_to,_from,_n_bytes) \
{ unsigned long toaddr; \
  unsigned long fromaddr; \
  unsigned long movetable; \
  unsigned long len; \
  asm("\t" \
    "   LTGFR %[len],%[nby]          \n\t" \
    "   LG    %[ta],%[to]            \n\t" /* address of to area   */ \
    "   BRC   8,1f                   \n\t" /* nothing to copy      */ \
    \
    "   NILL  %[nby],255             \n\t" /* # bytes mod 256      */ \
    "   LG    %[fa],%[from]          \n\t" /* address of from area */ \
    "   BRC   8,3f                   \n\t" /* no rest, skip copying */ \
    \
    "   LARL  %[mta],2f              \n\t" /* MVC template addr    */ \
    "   AHI   %[nby],-1              \n\t" /* adjust for EX MVC    */ \
    \
    "   EX    %[nby],0(%[mta])       \n\t" /* only rightmost       */ \
                                           /* 8 bits of nby used   */ \
    /* Since nby is <= 4096 on entry to this code, we need no      */ \
    /* zero extension before using it in addr calc.                */ \
    "   LA    %[fa],1(%[nby],%[fa])  \n\t" /* adjust from addr     */ \
    "   LA    %[ta],1(%[nby],%[ta])  \n\t" /* adjust to addr       */ \
    \
    "3: SRAG  %[nby],%[len],8        \n\t" /* # cache lines        */ \
    "   LARL  %[mta],1f              \n\t" /* MVC table begin      */ \
    "   BRC   8,1f                   \n\t" /* nothing to copy      */ \
    \
    /* Insert ASSERT code here if required. */ \
    \
    "   LNGFR %[nby],%[nby]          \n\t" /* negative offset into      */ \
    "   SLLG  %[nby],%[nby],4        \n\t" /* MVC table, 16-byte blocks */ \
    "   BC    15,0(%[nby],%[mta])    \n\t" /* branch to block #ncl      */ \
    \
    "2: MVC   0(1,%[ta]),0(%[fa])    \n\t" /* MVC template         */ \
    \
    "4: MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 4096 == l            */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "5: MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 3840 <= l < 4096     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 3584 <= l < 3840     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 3328 <= l < 3584     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 3072 <= l < 3328     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 2816 <= l < 3072     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 2560 <= l < 2816     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 2304 <= l < 2560     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 2048 <= l < 2304     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 1792 <= l < 2048     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 1536 <= l < 1792     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 1280 <= l < 1536     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /* 1024 <= l < 1280     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /*  768 <= l < 1024     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /*  512 <= l <  768     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "   MVC   0(256,%[ta]),0(%[fa])  \n\t" /*  256 <= l <  512     */ \
    "   LAY   %[ta],256(0,%[ta])     \n\t" \
    "   LA    %[fa],256(0,%[fa])     \n\t" \
    "1: BCR   0,0                    \n\t" /* nop as branch target */ \
    : [to]   "+Q" (_to)       /* outputs   */ \
    , [from] "+Q" (_from) \
    , [ta]   "=a" (toaddr) \
    , [fa]   "=a" (fromaddr) \
    , [mta]  "=a" (movetable) \
    , [nby]  "+a" (_n_bytes) \
    , [len]  "=a" (len) \
    : \
    : "cc"                    /* clobbered */ \
  ); \
}
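
// For illustration only: the control flow of MVC_MULTI above corresponds
// roughly to the following C-level sketch (not compiled). memcpy() stands in
// for the EX'ed MVC template and for the unrolled 256-byte MVC blocks.
#if 0
static void mvc_multi_sketch(char* to, char* from, size_t n_bytes) {
  if (n_bytes == 0) return;
  size_t rest = n_bytes & 255;   // length mod 256, handled by the EX'ed MVC template
  if (rest > 0) {
    memcpy(to, from, rest);
    to += rest; from += rest;
  }
  size_t blocks = n_bytes >> 8;  // number of full 256-byte blocks
  // The asm computes a negative offset and branches into a table of 16
  // unrolled "MVC 0(256,...),0(...)" blocks; a plain loop is the portable equivalent.
  while (blocks-- > 0) {
    memcpy(to, from, 256);
    to += 256; from += 256;
  }
}
#endif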

#define MVCLE_MEMCOPY(_to,_from,_len) \
  asm( \
    "   LG    0,%[to]     \n\t" /* address of to area   */ \
    "   LG    2,%[from]   \n\t" /* address of from area */ \
    "   LGR   1,%[len]    \n\t" /* len of to area       */ \
    "   LGR   3,%[len]    \n\t" /* len of from area     */ \
    "1: MVCLE 0,2,176     \n\t" /* copy storage, bypass cache (0xb0) */ \
    "   BRC   1,1b        \n\t" /* retry if interrupted */ \
    : [to]   "+Q" (_to)         /* outputs   */ \
    , [from] "+Q" (_from)       /* outputs   */ \
    : [len]  "r"  (_len)        /* inputs    */ \
    : "cc", "r0", "r1", "r2", "r3" /* clobbered */ \
  );

#define MVCLE_MEMINIT(_to,_val,_len) \
  asm( \
    "   LG    0,%[to]          \n\t" /* address of to area   */ \
    "   LGR   1,%[len]         \n\t" /* len of to area       */ \
    "   XGR   3,3              \n\t" /* from area len = 0    */ \
    "1: MVCLE 0,2,0(%[val])    \n\t" /* init storage         */ \
    "   BRC   1,1b             \n\t" /* retry if interrupted */ \
    : [to]  "+Q" (_to)               /* outputs   */ \
    : [len] "r"  (_len)              /* inputs    */ \
    , [val] "r"  (_val)              /* inputs    */ \
    : "cc", "r0", "r1", "r3"         /* clobbered */ \
  );
#define MVCLE_MEMZERO(_to,_len) \
  asm( \
    "   LG    0,%[to]          \n\t" /* address of to area   */ \
    "   LGR   1,%[len]         \n\t" /* len of to area       */ \
    "   XGR   3,3              \n\t" /* from area len = 0    */ \
    "1: MVCLE 0,2,0            \n\t" /* clear storage        */ \
    "   BRC   1,1b             \n\t" /* retry if interrupted */ \
    : [to]  "+Q" (_to)               /* outputs   */ \
    : [len] "r"  (_len)              /* inputs    */ \
    : "cc", "r0", "r1", "r3"         /* clobbered */ \
  );

// Clear a stretch of memory, 0 <= _len <= 256.
// There is no alignment prereq.
// Note: _len is not checked against the range specified above.
#define XC_MEMZERO_256(_to,_len) \
{ unsigned long toaddr; unsigned long tolen; \
  unsigned long target; \
  asm("\t" \
    "   LTGR %[tolen],%[len]               \n\t" /* copy and test len  */ \
    "   BRC  8,2f                          \n\t" /* do nothing for l=0 */ \
    "   AGHI %[tolen],-1                   \n\t" /* adjust for EX XC   */ \
    "   LARL %[target],1f                  \n\t" /* addr of XC instr   */ \
    "   LG   %[toaddr],%[to]               \n\t" /* addr of data area  */ \
    "   EX   %[tolen],0(%[target])         \n\t" /* execute XC instr   */ \
    "   BRC  15,2f                         \n\t" /* skip template      */ \
    "1: XC   0(1,%[toaddr]),0(%[toaddr])   \n\t" \
    "2: BCR  0,0                           \n\t" /* nop as branch target */ \
    : [to]     "+Q" (_to)     /* outputs   */ \
    , [tolen]  "=a" (tolen) \
    , [toaddr] "=a" (toaddr) \
    , [target] "=a" (target) \
    : [len]    "r"  (_len)    /* inputs    */ \
    : "cc"                    /* clobbered */ \
  ); \
}

// Clear a stretch of memory, 256 < _len.
// XC_MEMZERO_256 may be used to clear shorter areas.
//
// The code
// - first zeroes a few bytes to align on a HeapWord.
//   This step is currently inactive because all calls seem
//   to have their data aligned on HeapWord boundaries.
// - then zeroes a few HeapWords to align on a cache line.
// - then zeroes entire cache lines in a loop.
// - then zeroes the remaining (partial) cache line.
#if 1
#define XC_MEMZERO_ANY(_to,_len) \
{ unsigned long toaddr; unsigned long tolen; \
  unsigned long len8;   unsigned long len256; \
  unsigned long target; unsigned long lenx; \
  asm("\t" \
    "   LTGR  %[tolen],%[len]                  \n\t" /* copy and test len   */ \
    "   BRC   8,2f                             \n\t" /* do nothing for l=0  */ \
    "   LG    %[toaddr],%[to]                  \n\t" /* addr of data area   */ \
    "   LARL  %[target],1f                     \n\t" /* addr of XC instr    */ \
    " " \
    "   LCGR  %[len256],%[toaddr]              \n\t" /* cache line alignment */ \
    "   NILL  %[len256],0xff                   \n\t" \
    "   BRC   8,4f                             \n\t" /* already aligned     */ \
    "   NILH  %[len256],0x00                   \n\t" /* zero extend         */ \
    "   LLGFR %[len256],%[len256]              \n\t" \
    "   LAY   %[lenx],-1(,%[len256])           \n\t" \
    "   EX    %[lenx],0(%[target])             \n\t" /* execute XC instr    */ \
    "   LA    %[toaddr],0(%[len256],%[toaddr]) \n\t" \
    "   SGR   %[tolen],%[len256]               \n\t" /* adjust len          */ \
    " " \
    "4: SRAG  %[lenx],%[tolen],8               \n\t" /* # cache lines       */ \
    "   BRC   8,6f                             \n\t" /* no full cache lines */ \
    "5: XC    0(256,%[toaddr]),0(%[toaddr])    \n\t" \
    "   LA    %[toaddr],256(,%[toaddr])        \n\t" \
    "   BRCTG %[lenx],5b                       \n\t" /* iterate             */ \
    " " \
    "6: NILL  %[tolen],0xff                    \n\t" /* leftover bytes      */ \
    "   BRC   8,2f                             \n\t" /* done if none        */ \
    "   LAY   %[lenx],-1(,%[tolen])            \n\t" \
    "   EX    %[lenx],0(%[target])             \n\t" /* execute XC instr    */ \
    "   BRC   15,2f                            \n\t" /* skip template       */ \
    " " \
    "1: XC    0(1,%[toaddr]),0(%[toaddr])      \n\t" \
    "2: BCR   0,0                              \n\t" /* nop as branch target */ \
    : [to]     "+Q" (_to)     /* outputs   */ \
    , [lenx]   "=a" (lenx) \
    , [len256] "=a" (len256) \
    , [tolen]  "=a" (tolen) \
    , [toaddr] "=a" (toaddr) \
    , [target] "=a" (target) \
    : [len]    "r"  (_len)    /* inputs    */ \
    : "cc"                    /* clobbered */ \
  ); \
}
#else
#define XC_MEMZERO_ANY(_to,_len) \
{ unsigned long toaddr; unsigned long tolen; \
  unsigned long len8;   unsigned long len256; \
  unsigned long target; unsigned long lenx; \
  asm("\t" \
    "   LTGR  %[tolen],%[len]                  \n\t" /* copy and test len   */ \
    "   BRC   8,2f                             \n\t" /* do nothing for l=0  */ \
    "   LG    %[toaddr],%[to]                  \n\t" /* addr of data area   */ \
    "   LARL  %[target],1f                     \n\t" /* addr of XC instr    */ \
    " " \
    "   LCGR  %[len8],%[toaddr]                \n\t" /* HeapWord alignment  */ \
    "   NILL  %[len8],0x07                     \n\t" \
    "   BRC   8,3f                             \n\t" /* already aligned     */ \
    "   NILH  %[len8],0x00                     \n\t" /* zero extend         */ \
    "   LLGFR %[len8],%[len8]                  \n\t" \
    "   LAY   %[lenx],-1(,%[len8])             \n\t" \
    "   EX    %[lenx],0(%[target])             \n\t" /* execute XC instr    */ \
    "   LA    %[toaddr],0(%[len8],%[toaddr])   \n\t" \
    "   SGR   %[tolen],%[len8]                 \n\t" /* adjust len          */ \
    " " \
    "3: LCGR  %[len256],%[toaddr]              \n\t" /* cache line alignment */ \
    "   NILL  %[len256],0xff                   \n\t" \
    "   BRC   8,4f                             \n\t" /* already aligned     */ \
    "   NILH  %[len256],0x00                   \n\t" /* zero extend         */ \
    "   LLGFR %[len256],%[len256]              \n\t" \
    "   LAY   %[lenx],-1(,%[len256])           \n\t" \
    "   EX    %[lenx],0(%[target])             \n\t" /* execute XC instr    */ \
    "   LA    %[toaddr],0(%[len256],%[toaddr]) \n\t" \
    "   SGR   %[tolen],%[len256]               \n\t" /* adjust len          */ \
    " " \
    "4: SRAG  %[lenx],%[tolen],8               \n\t" /* # cache lines       */ \
    "   BRC   8,6f                             \n\t" /* no full cache lines */ \
    "5: XC    0(256,%[toaddr]),0(%[toaddr])    \n\t" \
    "   LA    %[toaddr],256(,%[toaddr])        \n\t" \
    "   BRCTG %[lenx],5b                       \n\t" /* iterate             */ \
    " " \
    "6: NILL  %[tolen],0xff                    \n\t" /* leftover bytes      */ \
    "   BRC   8,2f                             \n\t" /* done if none        */ \
    "   LAY   %[lenx],-1(,%[tolen])            \n\t" \
    "   EX    %[lenx],0(%[target])             \n\t" /* execute XC instr    */ \
    "   BRC   15,2f                            \n\t" /* skip template       */ \
    " " \
    "1: XC    0(1,%[toaddr]),0(%[toaddr])      \n\t" \
    "2: BCR   0,0                              \n\t" /* nop as branch target */ \
    : [to]     "+Q" (_to)     /* outputs   */ \
    , [lenx]   "=a" (lenx) \
    , [len8]   "=a" (len8) \
    , [len256] "=a" (len256) \
    , [tolen]  "=a" (tolen) \
    , [toaddr] "=a" (toaddr) \
    , [target] "=a" (target) \
    : [len]    "r"  (_len)    /* inputs    */ \
    : "cc"                    /* clobbered */ \
  ); \
}
#endif
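
// For illustration only: XC_MEMZERO_ANY (the #if 1 variant above) corresponds
// roughly to this C-level sketch (not compiled). memset() stands in for the
// EX'ed XC template and for the 256-byte XC instructions.
#if 0
static void xc_memzero_any_sketch(char* to, size_t len) {
  if (len == 0) return;
  // Zero a few bytes to reach the next 256-byte (cache line) boundary.
  size_t head = (256 - ((uintptr_t)to & 255)) & 255;
  if (head > 0) {
    memset(to, 0, head);
    to += head; len -= head;
  }
  // Zero entire cache lines (the asm loops over "XC 0(256,...),0(...)").
  size_t lines = len >> 8;
  while (lines-- > 0) {
    memset(to, 0, 256);
    to += 256;
  }
  // Zero the remaining partial cache line via the EX'ed XC template.
  memset(to, 0, len & 255);
}
#endif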
#endif // USE_INLINE_ASM

//*************************************//
//   D I S J O I N T   C O P Y I N G   //
//*************************************//

static void pd_aligned_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very frequent, some tests frequent.

  // Copy HeapWord (=DW) aligned storage. Use MVCLE in inline-asm code.
  // MVCLE guarantees DW concurrent (i.e. atomic) accesses if both the addresses of the operands
  // are DW aligned and the length is an integer multiple of a DW. Should always be true here.
  //
  // No special exploit needed. H/W discovers suitable situations itself.
  //
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) copy short piece of memory to page-align address(es)
  // 2) copy largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an integer multiple of page size. In that case, up to 4 cache lines
  //    are processed in parallel and L1 cache is not polluted.
  // 3) copy the remaining piece of memory.
  //
#ifdef USE_INLINE_ASM
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count*HeapWordSize;

  // Optimized copying for data less than 4k
  switch (count) {
    case 0: return;
    case 1: MOVE8_ATOMIC_1(to,from)
            return;
    case 2: MOVE8_ATOMIC_2(to,from)
            return;
    // case 3: MOVE8_ATOMIC_3(to,from)
    //         return;
    // case 4: MOVE8_ATOMIC_4(to,from)
    //         return;
    default:
      if (len_bytes <= 4096) {
        MVC_MULTI(to,from,len_bytes)
        return;
      }
      // else
      MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
      return;
  }
#else
  // Fallback code.
  switch (count) {
    case 0:
      return;

    case 1:
      *to = *from;
      return;

    case 2:
      *to++ = *from++;
      *to = *from;
      return;

    case 3:
      *to++ = *from++;
      *to++ = *from++;
      *to = *from;
      return;

    case 4:
      *to++ = *from++;
      *to++ = *from++;
      *to++ = *from++;
      *to = *from;
      return;

    default:
      while (count-- > 0)
        *(to++) = *(from++);
      return;
  }
#endif
}

static void pd_disjoint_words_atomic(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: < 4k calls.
  assert(((((size_t)from) & 0x07L) | (((size_t)to) & 0x07L)) == 0, "No atomic copy w/o aligned data");
  pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
}

static void pd_disjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: very rare.
  pd_aligned_disjoint_words(from, to, count); // Rare calls -> just delegate.
}


//*************************************//
//   C O N J O I N T   C O P Y I N G   //
//*************************************//

static void pd_aligned_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {
  // JVM2008: between some and lower end of frequent.

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from)
              return;
      case 3: COPY8_ATOMIC_3(to,from)
              return;
      case 2: COPY8_ATOMIC_2(to,from)
              return;
      case 1: COPY8_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0)
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  return;
#else
  // Fallback code.
  if (has_destructive_overlap((char*)from, (char*)to, count*BytesPerLong)) {
    HeapWord t1, t2, t3;
    switch (count) {
      case 0:
        return;

      case 1:
        *to = *from;
        return;

      case 2:
        t1 = *(from+1);
        *to = *from;
        *(to+1) = t1;
        return;

      case 3:
        t1 = *(from+1);
        t2 = *(from+2);
        *to = *from;
        *(to+1) = t1;
        *(to+2) = t2;
        return;

      case 4:
        t1 = *(from+1);
        t2 = *(from+2);
        t3 = *(from+3);
        *to = *from;
        *(to+1) = t1;
        *(to+2) = t2;
        *(to+3) = t3;
        return;

      default:
        from += count;
        to   += count;
        while (count-- > 0)
          *(--to) = *(--from); // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_disjoint_words(from, to, count);
#endif
}

static void pd_conjoint_words(HeapWord* from, HeapWord* to, size_t count) {

  // Just delegate. HeapWords are optimally aligned anyway.
  pd_aligned_conjoint_words(from, to, count);
}

static void pd_conjoint_bytes(void* from, void* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in))
    (void)memmove(to, from, count_in);
  else {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  if (has_destructive_overlap((char*)from, (char*)to, count))
    (void)memmove(to, from, count);
  else
    (void)memcpy(to, from, count);
#endif
}

//**************************************************//
//   C O N J O I N T   A T O M I C   C O P Y I N G  //
//**************************************************//

static void pd_conjoint_bytes_atomic(void* from, void* to, size_t count) {
  // Delegate to pd_conjoint_bytes. Bytes are always accessed atomically.
  pd_conjoint_bytes(from, to, count);
}

static void pd_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerShort)) {
    // Use optimizations from shared code where no z-specific optimization exists.
    copy_conjoint_jshorts_atomic(from, to, count);
  } else {
    jbyte* to_bytes   = (jbyte*)to;
    jbyte* from_bytes = (jbyte*)from;
    size_t len_bytes  = count_in*BytesPerShort;
    MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
  }
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jshorts_atomic(from, to, count);
#endif
}

static void pd_conjoint_jints_atomic(jint* from, jint* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerInt)) {
    switch (count_in) {
      case 4: COPY4_ATOMIC_4(to,from)
              return;
      case 3: COPY4_ATOMIC_3(to,from)
              return;
      case 2: COPY4_ATOMIC_2(to,from)
              return;
      case 1: COPY4_ATOMIC_1(to,from)
              return;
      case 0: return;
      default:
        // Use optimizations from shared code where no z-specific optimization exists.
        copy_conjoint_jints_atomic(from, to, count_in);
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerInt;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  // Use optimizations from shared code where no z-specific optimization exists.
  copy_conjoint_jints_atomic(from, to, count);
#endif
}

static void pd_conjoint_jlongs_atomic(jlong* from, jlong* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from) return;
      case 3: COPY8_ATOMIC_3(to,from) return;
      case 2: COPY8_ATOMIC_2(to,from) return;
      case 1: COPY8_ATOMIC_1(to,from) return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerLong;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerLong)) {
    if (count_in < 8) {
      from += count_in;
      to   += count_in;
      while (count_in-- > 0)
        *(--to) = *(--from); // Copy backwards, areas overlap destructively.
      return;
    }
    // else
    from += count_in-1;
    to   += count_in-1;
    if (count_in&0x01) {
      *(to--) = *(from--);
      count_in--;
    }
    for (; count_in>0; count_in-=2) {
      *to     = *from;
      *(to-1) = *(from-1);
      to   -= 2;
      from -= 2;
    }
  }
  else
    pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
#endif
}

static void pd_conjoint_oops_atomic(oop* from, oop* to, size_t count) {

#ifdef USE_INLINE_ASM
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    switch (count_in) {
      case 4: COPY8_ATOMIC_4(to,from) return;
      case 3: COPY8_ATOMIC_3(to,from) return;
      case 2: COPY8_ATOMIC_2(to,from) return;
      case 1: COPY8_ATOMIC_1(to,from) return;
      case 0: return;
      default:
        from += count_in;
        to   += count_in;
        while (count_in-- > 0) { *(--to) = *(--from); } // Copy backwards, areas overlap destructively.
        return;
    }
  }
  // else
  jbyte* to_bytes   = (jbyte*)to;
  jbyte* from_bytes = (jbyte*)from;
  size_t len_bytes  = count_in*BytesPerOop;
  MVCLE_MEMCOPY(to_bytes, from_bytes, len_bytes)
#else
  size_t count_in = count;
  if (has_destructive_overlap((char*)from, (char*)to, count_in*BytesPerOop)) {
    from += count_in;
    to   += count_in;
    while (count_in-- > 0) *(--to) = *(--from); // Copy backwards, areas overlap destructively.
    return;
  }
  // else
  pd_aligned_disjoint_words((HeapWord*)from, (HeapWord*)to, count_in); // rare calls -> just delegate.
  return;
#endif
}

static void pd_arrayof_conjoint_bytes(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_bytes_atomic(from, to, count);
}

static void pd_arrayof_conjoint_jshorts(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jshorts_atomic((jshort*)from, (jshort*)to, count);
}

static void pd_arrayof_conjoint_jints(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jints_atomic((jint*)from, (jint*)to, count);
}

static void pd_arrayof_conjoint_jlongs(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_jlongs_atomic((jlong*)from, (jlong*)to, count);
}

static void pd_arrayof_conjoint_oops(HeapWord* from, HeapWord* to, size_t count) {
  pd_conjoint_oops_atomic((oop*)from, (oop*)to, count);
}

//**********************************************//
//   M E M O R Y   I N I T I A L I S A T I O N  //
//**********************************************//

static void pd_fill_to_bytes(void* to, size_t count, jubyte value) {
  // JVM2008: very rare, only in some tests.
#ifdef USE_INLINE_ASM
  // Initialize storage to a given value. Use memset instead of copy loop.
  // For large chunks of memory, exploit special H/W support of z/Architecture:
  // 1) init short piece of memory to page-align address
  // 2) init largest part (all contained full pages) of memory using mvcle instruction.
  //    z/Architecture processors have special H/W support for page-aligned storage
  //    where len is an integer multiple of page size. In that case, up to 4 cache lines
  //    are processed in parallel and L1 cache is not polluted.
  // 3) init the remaining piece of memory.
  // Atomicity cannot really be an issue since gcc implements the loop body with XC anyway.
  // If atomicity is a problem, we have to prevent gcc optimization. Best workaround: inline asm.

  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;

  MVCLE_MEMINIT(to_bytes, value, len_bytes)

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, value, count);
#endif
}

static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) {
  // Occurs in dbg builds only. Usually memory poisoning with BAADBABE, DEADBEEF, etc.
  // JVM2008: < 4k calls.
  if (value == 0) {
    pd_zero_to_words(tohw, count);
    return;
  }
  if (value == ~(juint)(0)) {
    pd_fill_to_bytes(tohw, count*HeapWordSize, (jubyte)(~(juint)(0)));
    return;
  }
  julong* to = (julong*) tohw;
  julong  v  = ((julong) value << 32) | value;
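  // Example: value == 0xDEADBEEF  =>  v == 0xDEADBEEFDEADBEEF; each loop
  // iteration below stores one such doubleword.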
  while (count-- > 0) {
    *to++ = v;
  }
}

static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) {
  // JVM2008: very frequent, but virtually all calls are with value == 0.
  pd_fill_to_words(tohw, count, value);
}

//**********************************//
//   M E M O R Y   C L E A R I N G  //
//**********************************//

// Delegate to pd_zero_to_bytes. It is also HeapWord-atomic.
// Distinguish between simple and large zero_to_words.
static void pd_zero_to_words(HeapWord* tohw, size_t count) {
  pd_zero_to_bytes(tohw, count*HeapWordSize);
}

// Delegate to pd_zero_to_bytes. It is also HeapWord-atomic.
static void pd_zero_to_words_large(HeapWord* tohw, size_t count) {
  // JVM2008: generally frequent, some tests show very frequent calls.
  pd_zero_to_bytes(tohw, count*HeapWordSize);
}

static void pd_zero_to_bytes(void* to, size_t count) {
  // JVM2008: some calls (generally), some tests frequent.
#ifdef USE_INLINE_ASM
  // Even zero_to_bytes() requires HeapWord-atomic, or at least sequential,
  // zeroing of the memory. MVCLE is not fit for that job:
  //   "As observed by other CPUs and by the channel subsystem,
  //    that portion of the first operand which is filled
  //    with the padding byte is not necessarily stored into in
  //    a left-to-right direction and may appear to be stored
  //    into more than once."
  // Therefore, the implementation was changed to use (multiple) XC instructions.

  const long line_size = 256;
  jbyte* to_bytes  = (jbyte*)to;
  size_t len_bytes = count;

  if (len_bytes <= line_size) {
    XC_MEMZERO_256(to_bytes, len_bytes);
  } else {
    XC_MEMZERO_ANY(to_bytes, len_bytes);
  }

#else
  // Memset does the best job possible: loop over 256-byte MVCs, with
  // the last MVC EXecuted. With the -mmvcle option, initialization
  // is done using MVCLE -> slight advantage for large areas.
  (void)memset(to, 0, count);
#endif
}

#endif // CPU_S390_VM_COPY_S390_HPP