1286 Label L_loop; |
1286 Label L_loop; |
1287 __ align(OptoLoopAlignment); |
1287 __ align(OptoLoopAlignment); |
1288 if (UseUnalignedLoadStores) { |
1288 if (UseUnalignedLoadStores) { |
1289 Label L_end; |
1289 Label L_end; |
1290 // Copy 64-bytes per iteration |
1290 // Copy 64-bytes per iteration |
1291 __ BIND(L_loop); |
|
1292 if (UseAVX > 2) { |
1291 if (UseAVX > 2) { |
|
1292 Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold; |
|
1293 |
|
1294 __ BIND(L_copy_bytes); |
|
1295 __ cmpptr(qword_count, (-1 * AVX3Threshold / 8)); |
|
1296 __ jccb(Assembler::less, L_above_threshold); |
|
1297 __ jmpb(L_below_threshold); |
|
1298 |
|
1299 __ bind(L_loop_avx512); |
1293 __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit); |
1300 __ evmovdqul(xmm0, Address(end_from, qword_count, Address::times_8, -56), Assembler::AVX_512bit); |
1294 __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit); |
1301 __ evmovdqul(Address(end_to, qword_count, Address::times_8, -56), xmm0, Assembler::AVX_512bit); |
1295 } else if (UseAVX == 2) { |
1302 __ bind(L_above_threshold); |
|
1303 __ addptr(qword_count, 8); |
|
1304 __ jcc(Assembler::lessEqual, L_loop_avx512); |
|
1305 __ jmpb(L_32_byte_head); |
|
1306 |
|
1307 __ bind(L_loop_avx2); |
1296 __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); |
1308 __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); |
1297 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); |
1309 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); |
1298 __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24)); |
1310 __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24)); |
1299 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1); |
1311 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1); |
|
1312 __ bind(L_below_threshold); |
|
1313 __ addptr(qword_count, 8); |
|
1314 __ jcc(Assembler::lessEqual, L_loop_avx2); |
|
1315 |
|
1316 __ bind(L_32_byte_head); |
|
1317 __ subptr(qword_count, 4); // sub(8) and add(4) |
|
1318 __ jccb(Assembler::greater, L_end); |
1300 } else { |
1319 } else { |
1301 __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); |
1320 __ BIND(L_loop); |
1302 __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); |
1321 if (UseAVX == 2) { |
1303 __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40)); |
1322 __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); |
1304 __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1); |
1323 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); |
1305 __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24)); |
1324 __ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24)); |
1306 __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2); |
1325 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1); |
1307 __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8)); |
1326 } else { |
1308 __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3); |
1327 __ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56)); |
|
1328 __ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0); |
|
1329 __ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40)); |
|
1330 __ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1); |
|
1331 __ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24)); |
|
1332 __ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2); |
|
1333 __ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8)); |
|
1334 __ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3); |
|
1335 } |
|
1336 |
|
1337 __ BIND(L_copy_bytes); |
|
1338 __ addptr(qword_count, 8); |
|
1339 __ jcc(Assembler::lessEqual, L_loop); |
|
1340 __ subptr(qword_count, 4); // sub(8) and add(4) |
|
1341 __ jccb(Assembler::greater, L_end); |
1309 } |
1342 } |
1310 __ BIND(L_copy_bytes); |
|
1311 __ addptr(qword_count, 8); |
|
1312 __ jcc(Assembler::lessEqual, L_loop); |
|
1313 __ subptr(qword_count, 4); // sub(8) and add(4) |
|
1314 __ jccb(Assembler::greater, L_end); |
|
1315 // Copy trailing 32 bytes |
1343 // Copy trailing 32 bytes |
1316 if (UseAVX >= 2) { |
1344 if (UseAVX >= 2) { |
1317 __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); |
1345 __ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24)); |
1318 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0); |
1346 __ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0); |
1319 } else { |
1347 } else { |
1366 Label L_loop; |
1394 Label L_loop; |
1367 __ align(OptoLoopAlignment); |
1395 __ align(OptoLoopAlignment); |
1368 if (UseUnalignedLoadStores) { |
1396 if (UseUnalignedLoadStores) { |
1369 Label L_end; |
1397 Label L_end; |
1370 // Copy 64-bytes per iteration |
1398 // Copy 64-bytes per iteration |
1371 __ BIND(L_loop); |
|
1372 if (UseAVX > 2) { |
1399 if (UseAVX > 2) { |
|
1400 Label L_loop_avx512, L_loop_avx2, L_32_byte_head, L_above_threshold, L_below_threshold; |
|
1401 |
|
1402 __ BIND(L_copy_bytes); |
|
1403 __ cmpptr(qword_count, (AVX3Threshold / 8)); |
|
1404 __ jccb(Assembler::greater, L_above_threshold); |
|
1405 __ jmpb(L_below_threshold); |
|
1406 |
|
1407 __ BIND(L_loop_avx512); |
1373 __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit); |
1408 __ evmovdqul(xmm0, Address(from, qword_count, Address::times_8, 0), Assembler::AVX_512bit); |
1374 __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit); |
1409 __ evmovdqul(Address(dest, qword_count, Address::times_8, 0), xmm0, Assembler::AVX_512bit); |
1375 } else if (UseAVX == 2) { |
1410 __ bind(L_above_threshold); |
|
1411 __ subptr(qword_count, 8); |
|
1412 __ jcc(Assembler::greaterEqual, L_loop_avx512); |
|
1413 __ jmpb(L_32_byte_head); |
|
1414 |
|
1415 __ bind(L_loop_avx2); |
1376 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32)); |
1416 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32)); |
1377 __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0); |
1417 __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0); |
1378 __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); |
1418 __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); |
1379 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); |
1419 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); |
|
1420 __ bind(L_below_threshold); |
|
1421 __ subptr(qword_count, 8); |
|
1422 __ jcc(Assembler::greaterEqual, L_loop_avx2); |
|
1423 |
|
1424 __ bind(L_32_byte_head); |
|
1425 __ addptr(qword_count, 4); // add(8) and sub(4) |
|
1426 __ jccb(Assembler::less, L_end); |
1380 } else { |
1427 } else { |
1381 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48)); |
1428 __ BIND(L_loop); |
1382 __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0); |
1429 if (UseAVX == 2) { |
1383 __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32)); |
1430 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32)); |
1384 __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1); |
1431 __ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0); |
1385 __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16)); |
1432 __ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0)); |
1386 __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2); |
1433 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1); |
1387 __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0)); |
1434 } else { |
1388 __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3); |
1435 __ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48)); |
|
1436 __ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0); |
|
1437 __ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32)); |
|
1438 __ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1); |
|
1439 __ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16)); |
|
1440 __ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2); |
|
1441 __ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0)); |
|
1442 __ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3); |
|
1443 } |
|
1444 |
|
1445 __ BIND(L_copy_bytes); |
|
1446 __ subptr(qword_count, 8); |
|
1447 __ jcc(Assembler::greaterEqual, L_loop); |
|
1448 |
|
1449 __ addptr(qword_count, 4); // add(8) and sub(4) |
|
1450 __ jccb(Assembler::less, L_end); |
1389 } |
1451 } |
1390 __ BIND(L_copy_bytes); |
|
1391 __ subptr(qword_count, 8); |
|
1392 __ jcc(Assembler::greaterEqual, L_loop); |
|
1393 |
|
1394 __ addptr(qword_count, 4); // add(8) and sub(4) |
|
1395 __ jccb(Assembler::less, L_end); |
|
1396 // Copy trailing 32 bytes |
1452 // Copy trailing 32 bytes |
1397 if (UseAVX >= 2) { |
1453 if (UseAVX >= 2) { |
1398 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0)); |
1454 __ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0)); |
1399 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0); |
1455 __ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0); |
1400 } else { |
1456 } else { |