    VectorSRegister tmp_vsr1  = VSR1;
    VectorSRegister tmp_vsr2  = VSR2;

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
    {
      // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
      UnsafeCopyMemoryMark ucmm(this, !aligned, false);

      // Don't try anything fancy if arrays don't have many elements.
      __ li(tmp3, 0);
      __ cmpwi(CCR0, R5_ARG3, 17);
      __ ble(CCR0, l_6); // copy 4 at a time

      if (!aligned) {
        __ xorr(tmp1, R3_ARG1, R4_ARG2);
        __ andi_(tmp1, tmp1, 3);
        __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.

        // Copy elements if necessary to align to 4 bytes.
        __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
        __ andi_(tmp1, tmp1, 3);
        __ beq(CCR0, l_2);

        __ subf(R5_ARG3, tmp1, R5_ARG3);
        __ bind(l_9);
        __ lbz(tmp2, 0, R3_ARG1);
        __ addic_(tmp1, tmp1, -1);
        __ stb(tmp2, 0, R4_ARG2);
        __ addi(R3_ARG1, R3_ARG1, 1);
        __ addi(R4_ARG2, R4_ARG2, 1);
        __ bne(CCR0, l_9);

        __ bind(l_2);
      }

      // copy 8 elements at a time
      __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
      __ andi_(tmp1, tmp2, 7);
      __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8

      // copy a 2-element word if necessary to align to 8 bytes
      __ andi_(R0, R3_ARG1, 7);
      __ beq(CCR0, l_7);

      __ lwzx(tmp2, R3_ARG1, tmp3);
      __ addi(R5_ARG3, R5_ARG3, -4);
      __ stwx(tmp2, R4_ARG2, tmp3);
      { // FasterArrayCopy
        __ addi(R3_ARG1, R3_ARG1, 4);
        __ addi(R4_ARG2, R4_ARG2, 4);
      }
      __ bind(l_7);

      { // FasterArrayCopy
        __ cmpwi(CCR0, R5_ARG3, 31);
        __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain

        __ srdi(tmp1, R5_ARG3, 5);
        __ andi_(R5_ARG3, R5_ARG3, 31);
        __ mtctr(tmp1);

        if (!VM_Version::has_vsx()) {

          __ bind(l_8);
          // Use unrolled version for mass copying (copy 32 elements a time)
          // Load feeding store gets zero latency on Power6, however not on Power5.
          // Therefore, the following sequence is made for the good of both.
          __ ld(tmp1, 0, R3_ARG1);
          __ ld(tmp2, 8, R3_ARG1);
          __ ld(tmp3, 16, R3_ARG1);
          __ ld(tmp4, 24, R3_ARG1);
          __ std(tmp1, 0, R4_ARG2);
          __ std(tmp2, 8, R4_ARG2);
          __ std(tmp3, 16, R4_ARG2);
          __ std(tmp4, 24, R4_ARG2);
          __ addi(R3_ARG1, R3_ARG1, 32);
          __ addi(R4_ARG2, R4_ARG2, 32);
          __ bdnz(l_8);

        } else { // Processor supports VSX, so use it to mass copy.

          // Prefetch the data into the L2 cache.
          __ dcbt(R3_ARG1, 0);

          // If supported set DSCR pre-fetch to deepest.
          if (VM_Version::has_mfdscr()) {
            __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
            __ mtdscr(tmp2);
          }

          __ li(tmp1, 16);

          // Backbranch target aligned to 32-byte. Not 16-byte align as
          // loop contains < 8 instructions that fit inside a single
          // i-cache sector.
          __ align(32);

          __ bind(l_10);
          // Use loop with VSX load/store instructions to
          // copy 32 elements a time.
          __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
          __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
          __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
          __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
          __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
          __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32
          __ bdnz(l_10);                       // Dec CTR and loop if not zero.

          // Restore DSCR pre-fetch value.
          if (VM_Version::has_mfdscr()) {
            __ load_const_optimized(tmp2, VM_Version::_dscr_val);
            __ mtdscr(tmp2);
          }

        } // VSX
      } // FasterArrayCopy

      __ bind(l_6);

      // copy 4 elements at a time
      __ cmpwi(CCR0, R5_ARG3, 4);
      __ blt(CCR0, l_1);
      __ srdi(tmp1, R5_ARG3, 2);
      __ mtctr(tmp1); // is > 0
      __ andi_(R5_ARG3, R5_ARG3, 3);

      { // FasterArrayCopy
        __ addi(R3_ARG1, R3_ARG1, -4);
        __ addi(R4_ARG2, R4_ARG2, -4);
        __ bind(l_3);
        __ lwzu(tmp2, 4, R3_ARG1);
        __ stwu(tmp2, 4, R4_ARG2);
        __ bdnz(l_3);
        __ addi(R3_ARG1, R3_ARG1, 4);
        __ addi(R4_ARG2, R4_ARG2, 4);
      }

      // do single element copy
      __ bind(l_1);
      __ cmpwi(CCR0, R5_ARG3, 0);
      __ beq(CCR0, l_4);

      { // FasterArrayCopy
        __ mtctr(R5_ARG3);
        __ addi(R3_ARG1, R3_ARG1, -1);
        __ addi(R4_ARG2, R4_ARG2, -1);

        __ bind(l_5);
        __ lbzu(tmp2, 1, R3_ARG1);
        __ stbu(tmp2, 1, R4_ARG2);
        __ bdnz(l_5);
      }
    }

    __ bind(l_4);
    __ li(R3_RET, 0); // return 0
    __ blr();
    // ...

    address start = __ function_entry();
    assert_positive_int(R5_ARG3);

    Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
    {
      // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
      UnsafeCopyMemoryMark ucmm(this, !aligned, false);
      // don't try anything fancy if arrays don't have many elements
      __ li(tmp3, 0);
      __ cmpwi(CCR0, R5_ARG3, 9);
      __ ble(CCR0, l_6); // copy 2 at a time

      if (!aligned) {
        __ xorr(tmp1, R3_ARG1, R4_ARG2);
        __ andi_(tmp1, tmp1, 3);
        __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy

        // At this point it is guaranteed that both, from and to have the same alignment mod 4.

        // Copy 1 element if necessary to align to 4 bytes.
        __ andi_(tmp1, R3_ARG1, 3);
        __ beq(CCR0, l_2);

        __ lhz(tmp2, 0, R3_ARG1);
        __ addi(R3_ARG1, R3_ARG1, 2);
        __ sth(tmp2, 0, R4_ARG2);
        __ addi(R4_ARG2, R4_ARG2, 2);
        __ addi(R5_ARG3, R5_ARG3, -1);
        __ bind(l_2);

        // At this point the positions of both, from and to, are at least 4 byte aligned.

        // Copy 4 elements at a time.
        // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
        __ xorr(tmp2, R3_ARG1, R4_ARG2);
        __ andi_(tmp1, tmp2, 7);
        __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned

        // Copy a 2-element word if necessary to align to 8 bytes.
        __ andi_(R0, R3_ARG1, 7);
        __ beq(CCR0, l_7);

        __ lwzx(tmp2, R3_ARG1, tmp3);
        __ addi(R5_ARG3, R5_ARG3, -2);
        __ stwx(tmp2, R4_ARG2, tmp3);
        { // FasterArrayCopy
          __ addi(R3_ARG1, R3_ARG1, 4);
          __ addi(R4_ARG2, R4_ARG2, 4);
        }
      }

      __ bind(l_7);

      // Copy 4 elements at a time; either the loads or the stores can
      // be unaligned if aligned == false.

      { // FasterArrayCopy
        __ cmpwi(CCR0, R5_ARG3, 15);
        __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain

        __ srdi(tmp1, R5_ARG3, 4);
        __ andi_(R5_ARG3, R5_ARG3, 15);
        __ mtctr(tmp1);

        if (!VM_Version::has_vsx()) {

          __ bind(l_8);
          // Use unrolled version for mass copying (copy 16 elements a time).
          // Load feeding store gets zero latency on Power6, however not on Power5.
          // Therefore, the following sequence is made for the good of both.
          __ ld(tmp1, 0, R3_ARG1);
          __ ld(tmp2, 8, R3_ARG1);
          __ ld(tmp3, 16, R3_ARG1);
          __ ld(tmp4, 24, R3_ARG1);
          __ std(tmp1, 0, R4_ARG2);
          __ std(tmp2, 8, R4_ARG2);
          __ std(tmp3, 16, R4_ARG2);
          __ std(tmp4, 24, R4_ARG2);
          __ addi(R3_ARG1, R3_ARG1, 32);
          __ addi(R4_ARG2, R4_ARG2, 32);
          __ bdnz(l_8);

        } else { // Processor supports VSX, so use it to mass copy.

          // Prefetch src data into L2 cache.
          __ dcbt(R3_ARG1, 0);

          // If supported set DSCR pre-fetch to deepest.
          if (VM_Version::has_mfdscr()) {
            __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
            __ mtdscr(tmp2);
          }
          __ li(tmp1, 16);

          // Backbranch target aligned to 32-byte. It's not aligned 16-byte
          // as loop contains < 8 instructions that fit inside a single
          // i-cache sector.
          __ align(32);

          __ bind(l_9);
          // Use loop with VSX load/store instructions to
          // copy 16 elements a time.
          __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
          __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
          __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
          __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
          __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
          __ addi(R4_ARG2, R4_ARG2, 32);       // Update dsc+=32.
          __ bdnz(l_9);                        // Dec CTR and loop if not zero.

          // Restore DSCR pre-fetch value.
          if (VM_Version::has_mfdscr()) {
            __ load_const_optimized(tmp2, VM_Version::_dscr_val);
            __ mtdscr(tmp2);
          }

        }
      } // FasterArrayCopy
      __ bind(l_6);

      // copy 2 elements at a time
      { // FasterArrayCopy
        __ cmpwi(CCR0, R5_ARG3, 2);
        __ blt(CCR0, l_1);
        __ srdi(tmp1, R5_ARG3, 1);
        __ andi_(R5_ARG3, R5_ARG3, 1);

        __ addi(R3_ARG1, R3_ARG1, -4);
        __ addi(R4_ARG2, R4_ARG2, -4);
        __ mtctr(tmp1);

        __ bind(l_3);
        __ lwzu(tmp2, 4, R3_ARG1);
        __ stwu(tmp2, 4, R4_ARG2);
        __ bdnz(l_3);

        __ addi(R3_ARG1, R3_ARG1, 4);
        __ addi(R4_ARG2, R4_ARG2, 4);
      }

      // do single element copy
      __ bind(l_1);
      __ cmpwi(CCR0, R5_ARG3, 0);
      __ beq(CCR0, l_4);

      { // FasterArrayCopy
        __ mtctr(R5_ARG3);
        __ addi(R3_ARG1, R3_ARG1, -2);
        __ addi(R4_ARG2, R4_ARG2, -2);

        __ bind(l_5);
        __ lhzu(tmp2, 2, R3_ARG1);
        __ sthu(tmp2, 2, R4_ARG2);
        __ bdnz(l_5);
      }
    }

    __ bind(l_4);
    __ li(R3_RET, 0); // return 0
    __ blr();

    return start;
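
    // -----------------------------------------------------------------------
    // For reference only: a minimal scalar C++ sketch of the tiering strategy
    // the two stubs above implement (align first, bulk-copy 32-byte chunks --
    // done with unrolled ld/std or VSX lxvd2x/stxvd2x in the stubs -- then
    // 4-byte words, then a single-element tail). The names below are
    // hypothetical and this is not HotSpot code; it only illustrates the
    // control flow of the byte-copy variant.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    static void tiered_disjoint_byte_copy(const uint8_t* src, uint8_t* dst, size_t count) {
      // Tier 0: copy single bytes until src reaches 4-byte alignment
      // (corresponds to the loop at label l_9).
      while (count > 0 && (reinterpret_cast<uintptr_t>(src) & 3) != 0) {
        *dst++ = *src++;
        --count;
      }
      // Tier 1: bulk-copy 32-byte chunks (the unrolled ld/std or VSX loop).
      while (count >= 32) {
        std::memcpy(dst, src, 32);
        src += 32; dst += 32; count -= 32;
      }
      // Tier 2: copy 4-byte words (the lwzu/stwu loop at label l_3).
      while (count >= 4) {
        std::memcpy(dst, src, 4);
        src += 4; dst += 4; count -= 4;
      }
      // Tier 3: single-element tail (the lbzu/stbu loop at label l_5).
      while (count > 0) {
        *dst++ = *src++;
        --count;
      }
    }

    int main() {
      uint8_t src[100], dst[100];
      for (int i = 0; i < 100; i++) src[i] = static_cast<uint8_t>(i);
      tiered_disjoint_byte_copy(src, dst, sizeof(src));
      return std::memcmp(src, dst, sizeof(src)); // 0 on success
    }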