src/hotspot/cpu/ppc/stubGenerator_ppc.cpp
changeset 55490 3f3dc00a69a5
parent 53483 60add902a57a
child 58904 1f7981ef8779
55489:c749ecf599c0 55490:3f3dc00a69a5
   950     __ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CCR0, Assembler::less), no_overlap_target);
   950     __ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CCR0, Assembler::less), no_overlap_target);
   951 
   951 
   952     // need to copy backwards
   952     // need to copy backwards
   953   }
   953   }
   954 
   954 
       
    955   // This is the common error exit stub for UnsafeCopyMemory.
       
   956   address generate_unsafecopy_common_error_exit() {
       
   957     address start_pc = __ pc();
       
   958     Register tmp1 = R6_ARG4;
       
    959     // The copy stub has probably changed the DSCR value; reset it.
       
   960     if (VM_Version::has_mfdscr()) {
       
   961       __ load_const_optimized(tmp1, VM_Version::_dscr_val);
       
   962       __ mtdscr(tmp1);
       
   963     }
       
   964     __ li(R3_RET, 0); // return 0
       
   965     __ blr();
       
   966     return start_pc;
       
   967   }
       
   968 
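The stub added above is the single resume point for every guarded copy stub in this file: a platform signal handler that sees a page fault inside one of the registered copy regions is expected to redirect execution here, which is why the stub restores DSCR (the copy loops may have raised the prefetch depth) and returns 0. A minimal sketch of that consultation, assuming the UnsafeCopyMemory helpers this change builds on; set_interrupted_pc() is a hypothetical stand-in for the platform-specific way of rewriting the interrupted context:

// Minimal sketch (not the actual platform signal handler code): how a fault inside
// a guarded copy stub is expected to reach the stub generated above.
static bool handle_unsafe_copy_fault(void* ucontext, address faulting_pc) {
  if (UnsafeCopyMemory::contains_pc(faulting_pc)) {
    // Resume at generate_unsafecopy_common_error_exit(), which resets DSCR and
    // returns 0 in R3_RET just like a normal copy-stub exit.
    address resume_pc = UnsafeCopyMemory::page_error_continue_pc(faulting_pc);
    set_interrupted_pc(ucontext, resume_pc);  // hypothetical platform helper
    return true;
  }
  return false;
}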
   955   // The guideline in the implementations of generate_disjoint_xxx_copy
   969   // The guideline in the implementations of generate_disjoint_xxx_copy
   956   // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
   970   // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
   957   // single instructions, but to avoid alignment interrupts (see subsequent
   971   // single instructions, but to avoid alignment interrupts (see subsequent
   958   // comment). Furthermore, we try to minimize misaligned access, even
   972   // comment). Furthermore, we try to minimize misaligned access, even
   959   // though they cause no alignment interrupt.
   973   // though they cause no alignment interrupt.
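The guideline above gives every copy stub the same overall shape: align the source to a natural boundary with narrow copies, move the bulk of the data 32 bytes per iteration with the widest available loads and stores, then finish the tail with progressively narrower accesses. A plain C++ rendering of that shape for the byte copy, purely illustrative (the real stub emits PPC instructions, first checks that both pointers share the same alignment, and adds Power6- and VSX-specific bulk loops):

// Illustrative C++ rendering of the strategy described above for the byte copy.
#include <cstddef>
#include <cstdint>
#include <cstring>

static void disjoint_byte_copy_sketch(const uint8_t* from, uint8_t* to, size_t count) {
  // Copy single bytes until 'from' is 4-byte aligned (only worthwhile when both
  // pointers have the same alignment mod 4, which the stub verifies first).
  while (count > 0 && (reinterpret_cast<uintptr_t>(from) & 3) != 0) {
    *to++ = *from++; count--;
  }
  // Bulk phase: 32 bytes per iteration with the widest available accesses
  // (ld/std quadruples, or lxvd2x/stxvd2x pairs on VSX-capable processors).
  while (count >= 32) {
    std::memcpy(to, from, 32); to += 32; from += 32; count -= 32;
  }
  // Tail: 4-byte words, then the remaining single bytes.
  while (count >= 4) {
    std::memcpy(to, from, 4); to += 4; from += 4; count -= 4;
  }
  while (count > 0) { *to++ = *from++; count--; }
}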
   987 
  1001 
   988     VectorSRegister tmp_vsr1  = VSR1;
  1002     VectorSRegister tmp_vsr1  = VSR1;
   989     VectorSRegister tmp_vsr2  = VSR2;
  1003     VectorSRegister tmp_vsr2  = VSR2;
   990 
  1004 
   991     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
  1005     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
   992 
  1006     {
   993     // Don't try anything fancy if arrays don't have many elements.
  1007       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
   994     __ li(tmp3, 0);
  1008       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
   995     __ cmpwi(CCR0, R5_ARG3, 17);
  1009 
   996     __ ble(CCR0, l_6); // copy 4 at a time
  1010       // Don't try anything fancy if arrays don't have many elements.
   997 
  1011       __ li(tmp3, 0);
   998     if (!aligned) {
  1012       __ cmpwi(CCR0, R5_ARG3, 17);
   999       __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1013       __ ble(CCR0, l_6); // copy 4 at a time
  1000       __ andi_(tmp1, tmp1, 3);
  1014 
  1001       __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
  1015       if (!aligned) {
  1002 
  1016         __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1003       // Copy elements if necessary to align to 4 bytes.
  1017         __ andi_(tmp1, tmp1, 3);
  1004       __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
  1018         __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
  1005       __ andi_(tmp1, tmp1, 3);
  1019 
  1006       __ beq(CCR0, l_2);
  1020         // Copy elements if necessary to align to 4 bytes.
  1007 
  1021         __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
  1008       __ subf(R5_ARG3, tmp1, R5_ARG3);
  1022         __ andi_(tmp1, tmp1, 3);
  1009       __ bind(l_9);
  1023         __ beq(CCR0, l_2);
  1010       __ lbz(tmp2, 0, R3_ARG1);
  1024 
  1011       __ addic_(tmp1, tmp1, -1);
  1025         __ subf(R5_ARG3, tmp1, R5_ARG3);
  1012       __ stb(tmp2, 0, R4_ARG2);
  1026         __ bind(l_9);
  1013       __ addi(R3_ARG1, R3_ARG1, 1);
  1027         __ lbz(tmp2, 0, R3_ARG1);
  1014       __ addi(R4_ARG2, R4_ARG2, 1);
  1028         __ addic_(tmp1, tmp1, -1);
  1015       __ bne(CCR0, l_9);
  1029         __ stb(tmp2, 0, R4_ARG2);
  1016 
  1030         __ addi(R3_ARG1, R3_ARG1, 1);
  1017       __ bind(l_2);
  1031         __ addi(R4_ARG2, R4_ARG2, 1);
  1018     }
  1032         __ bne(CCR0, l_9);
  1019 
  1033 
  1020     // copy 8 elements at a time
  1034         __ bind(l_2);
  1021     __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
       
  1022     __ andi_(tmp1, tmp2, 7);
       
  1023     __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
       
  1024 
       
  1025     // copy a 2-element word if necessary to align to 8 bytes
       
  1026     __ andi_(R0, R3_ARG1, 7);
       
  1027     __ beq(CCR0, l_7);
       
  1028 
       
  1029     __ lwzx(tmp2, R3_ARG1, tmp3);
       
  1030     __ addi(R5_ARG3, R5_ARG3, -4);
       
  1031     __ stwx(tmp2, R4_ARG2, tmp3);
       
  1032     { // FasterArrayCopy
       
  1033       __ addi(R3_ARG1, R3_ARG1, 4);
       
  1034       __ addi(R4_ARG2, R4_ARG2, 4);
       
  1035     }
       
  1036     __ bind(l_7);
       
  1037 
       
  1038     { // FasterArrayCopy
       
  1039       __ cmpwi(CCR0, R5_ARG3, 31);
       
  1040       __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain
       
  1041 
       
  1042       __ srdi(tmp1, R5_ARG3, 5);
       
  1043       __ andi_(R5_ARG3, R5_ARG3, 31);
       
  1044       __ mtctr(tmp1);
       
  1045 
       
  1046      if (!VM_Version::has_vsx()) {
       
  1047 
       
  1048       __ bind(l_8);
       
  1049       // Use unrolled version for mass copying (copy 32 elements a time)
       
  1050       // Load feeding store gets zero latency on Power6, however not on Power5.
       
  1051       // Therefore, the following sequence is made for the good of both.
       
  1052       __ ld(tmp1, 0, R3_ARG1);
       
  1053       __ ld(tmp2, 8, R3_ARG1);
       
  1054       __ ld(tmp3, 16, R3_ARG1);
       
  1055       __ ld(tmp4, 24, R3_ARG1);
       
  1056       __ std(tmp1, 0, R4_ARG2);
       
  1057       __ std(tmp2, 8, R4_ARG2);
       
  1058       __ std(tmp3, 16, R4_ARG2);
       
  1059       __ std(tmp4, 24, R4_ARG2);
       
  1060       __ addi(R3_ARG1, R3_ARG1, 32);
       
  1061       __ addi(R4_ARG2, R4_ARG2, 32);
       
  1062       __ bdnz(l_8);
       
  1063 
       
  1064     } else { // Processor supports VSX, so use it to mass copy.
       
  1065 
       
  1066       // Prefetch the data into the L2 cache.
       
  1067       __ dcbt(R3_ARG1, 0);
       
  1068 
       
  1069       // If supported set DSCR pre-fetch to deepest.
       
  1070       if (VM_Version::has_mfdscr()) {
       
  1071         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
       
  1072         __ mtdscr(tmp2);
       
  1073       }
  1035       }
  1074 
  1036 
  1075       __ li(tmp1, 16);
  1037       // copy 8 elements at a time
  1076 
  1038       __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
  1077       // Backbranch target aligned to 32-byte. Not 16-byte align as
  1039       __ andi_(tmp1, tmp2, 7);
  1078       // loop contains < 8 instructions that fit inside a single
  1040       __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
  1079       // i-cache sector.
  1041 
  1080       __ align(32);
  1042       // copy a 2-element word if necessary to align to 8 bytes
  1081 
  1043       __ andi_(R0, R3_ARG1, 7);
  1082       __ bind(l_10);
  1044       __ beq(CCR0, l_7);
  1083       // Use loop with VSX load/store instructions to
  1045 
  1084       // copy 32 elements a time.
  1046       __ lwzx(tmp2, R3_ARG1, tmp3);
  1085       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
  1047       __ addi(R5_ARG3, R5_ARG3, -4);
  1086       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
  1048       __ stwx(tmp2, R4_ARG2, tmp3);
  1087       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
  1049       { // FasterArrayCopy
  1088       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
  1050         __ addi(R3_ARG1, R3_ARG1, 4);
  1089       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
  1051         __ addi(R4_ARG2, R4_ARG2, 4);
   1090       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
       
  1091       __ bdnz(l_10);                       // Dec CTR and loop if not zero.
       
  1092 
       
  1093       // Restore DSCR pre-fetch value.
       
  1094       if (VM_Version::has_mfdscr()) {
       
  1095         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
       
  1096         __ mtdscr(tmp2);
       
  1097       }
  1052       }
  1098 
  1053       __ bind(l_7);
  1099     } // VSX
  1054 
  1100    } // FasterArrayCopy
  1055       { // FasterArrayCopy
  1101 
  1056         __ cmpwi(CCR0, R5_ARG3, 31);
  1102     __ bind(l_6);
  1057         __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain
  1103 
  1058 
  1104     // copy 4 elements at a time
  1059         __ srdi(tmp1, R5_ARG3, 5);
  1105     __ cmpwi(CCR0, R5_ARG3, 4);
  1060         __ andi_(R5_ARG3, R5_ARG3, 31);
  1106     __ blt(CCR0, l_1);
  1061         __ mtctr(tmp1);
  1107     __ srdi(tmp1, R5_ARG3, 2);
  1062 
  1108     __ mtctr(tmp1); // is > 0
  1063        if (!VM_Version::has_vsx()) {
  1109     __ andi_(R5_ARG3, R5_ARG3, 3);
  1064 
  1110 
  1065         __ bind(l_8);
  1111     { // FasterArrayCopy
  1066         // Use unrolled version for mass copying (copy 32 elements a time)
  1112       __ addi(R3_ARG1, R3_ARG1, -4);
  1067         // Load feeding store gets zero latency on Power6, however not on Power5.
  1113       __ addi(R4_ARG2, R4_ARG2, -4);
  1068         // Therefore, the following sequence is made for the good of both.
  1114       __ bind(l_3);
  1069         __ ld(tmp1, 0, R3_ARG1);
  1115       __ lwzu(tmp2, 4, R3_ARG1);
  1070         __ ld(tmp2, 8, R3_ARG1);
  1116       __ stwu(tmp2, 4, R4_ARG2);
  1071         __ ld(tmp3, 16, R3_ARG1);
  1117       __ bdnz(l_3);
  1072         __ ld(tmp4, 24, R3_ARG1);
  1118       __ addi(R3_ARG1, R3_ARG1, 4);
  1073         __ std(tmp1, 0, R4_ARG2);
  1119       __ addi(R4_ARG2, R4_ARG2, 4);
  1074         __ std(tmp2, 8, R4_ARG2);
  1120     }
  1075         __ std(tmp3, 16, R4_ARG2);
  1121 
  1076         __ std(tmp4, 24, R4_ARG2);
  1122     // do single element copy
  1077         __ addi(R3_ARG1, R3_ARG1, 32);
  1123     __ bind(l_1);
  1078         __ addi(R4_ARG2, R4_ARG2, 32);
  1124     __ cmpwi(CCR0, R5_ARG3, 0);
  1079         __ bdnz(l_8);
  1125     __ beq(CCR0, l_4);
  1080 
  1126 
  1081       } else { // Processor supports VSX, so use it to mass copy.
  1127     { // FasterArrayCopy
  1082 
  1128       __ mtctr(R5_ARG3);
  1083         // Prefetch the data into the L2 cache.
  1129       __ addi(R3_ARG1, R3_ARG1, -1);
  1084         __ dcbt(R3_ARG1, 0);
  1130       __ addi(R4_ARG2, R4_ARG2, -1);
  1085 
  1131 
  1086         // If supported set DSCR pre-fetch to deepest.
  1132       __ bind(l_5);
  1087         if (VM_Version::has_mfdscr()) {
  1133       __ lbzu(tmp2, 1, R3_ARG1);
  1088           __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
  1134       __ stbu(tmp2, 1, R4_ARG2);
  1089           __ mtdscr(tmp2);
  1135       __ bdnz(l_5);
  1090         }
       
  1091 
       
  1092         __ li(tmp1, 16);
       
  1093 
       
  1094         // Backbranch target aligned to 32-byte. Not 16-byte align as
       
  1095         // loop contains < 8 instructions that fit inside a single
       
  1096         // i-cache sector.
       
  1097         __ align(32);
       
  1098 
       
  1099         __ bind(l_10);
       
  1100         // Use loop with VSX load/store instructions to
       
  1101         // copy 32 elements a time.
       
  1102         __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
       
  1103         __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
       
  1104         __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
       
  1105         __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
       
  1106         __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
       
   1107         __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
       
  1108         __ bdnz(l_10);                       // Dec CTR and loop if not zero.
       
  1109 
       
  1110         // Restore DSCR pre-fetch value.
       
  1111         if (VM_Version::has_mfdscr()) {
       
  1112           __ load_const_optimized(tmp2, VM_Version::_dscr_val);
       
  1113           __ mtdscr(tmp2);
       
  1114         }
       
  1115 
       
  1116       } // VSX
       
  1117      } // FasterArrayCopy
       
  1118 
       
  1119       __ bind(l_6);
       
  1120 
       
  1121       // copy 4 elements at a time
       
  1122       __ cmpwi(CCR0, R5_ARG3, 4);
       
  1123       __ blt(CCR0, l_1);
       
  1124       __ srdi(tmp1, R5_ARG3, 2);
       
  1125       __ mtctr(tmp1); // is > 0
       
  1126       __ andi_(R5_ARG3, R5_ARG3, 3);
       
  1127 
       
  1128       { // FasterArrayCopy
       
  1129         __ addi(R3_ARG1, R3_ARG1, -4);
       
  1130         __ addi(R4_ARG2, R4_ARG2, -4);
       
  1131         __ bind(l_3);
       
  1132         __ lwzu(tmp2, 4, R3_ARG1);
       
  1133         __ stwu(tmp2, 4, R4_ARG2);
       
  1134         __ bdnz(l_3);
       
  1135         __ addi(R3_ARG1, R3_ARG1, 4);
       
  1136         __ addi(R4_ARG2, R4_ARG2, 4);
       
  1137       }
       
  1138 
       
  1139       // do single element copy
       
  1140       __ bind(l_1);
       
  1141       __ cmpwi(CCR0, R5_ARG3, 0);
       
  1142       __ beq(CCR0, l_4);
       
  1143 
       
  1144       { // FasterArrayCopy
       
  1145         __ mtctr(R5_ARG3);
       
  1146         __ addi(R3_ARG1, R3_ARG1, -1);
       
  1147         __ addi(R4_ARG2, R4_ARG2, -1);
       
  1148 
       
  1149         __ bind(l_5);
       
  1150         __ lbzu(tmp2, 1, R3_ARG1);
       
  1151         __ stbu(tmp2, 1, R4_ARG2);
       
  1152         __ bdnz(l_5);
       
  1153       }
  1136     }
  1154     }
  1137 
  1155 
  1138     __ bind(l_4);
  1156     __ bind(l_4);
  1139     __ li(R3_RET, 0); // return 0
  1157     __ li(R3_RET, 0); // return 0
  1140     __ blr();
  1158     __ blr();
  1165 
  1183 
  1166     array_overlap_test(nooverlap_target, 0);
  1184     array_overlap_test(nooverlap_target, 0);
  1167     // Do reverse copy. We assume the case of actual overlap is rare enough
  1185     // Do reverse copy. We assume the case of actual overlap is rare enough
  1168     // that we don't have to optimize it.
  1186     // that we don't have to optimize it.
  1169     Label l_1, l_2;
  1187     Label l_1, l_2;
  1170 
  1188     {
  1171     __ b(l_2);
  1189       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
  1172     __ bind(l_1);
  1190       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
  1173     __ stbx(tmp1, R4_ARG2, R5_ARG3);
  1191       __ b(l_2);
  1174     __ bind(l_2);
  1192       __ bind(l_1);
  1175     __ addic_(R5_ARG3, R5_ARG3, -1);
  1193       __ stbx(tmp1, R4_ARG2, R5_ARG3);
  1176     __ lbzx(tmp1, R3_ARG1, R5_ARG3);
  1194       __ bind(l_2);
  1177     __ bge(CCR0, l_1);
  1195       __ addic_(R5_ARG3, R5_ARG3, -1);
  1178 
  1196       __ lbzx(tmp1, R3_ARG1, R5_ARG3);
       
  1197       __ bge(CCR0, l_1);
       
  1198     }
  1179     __ li(R3_RET, 0); // return 0
  1199     __ li(R3_RET, 0); // return 0
  1180     __ blr();
  1200     __ blr();
  1181 
  1201 
  1182     return start;
  1202     return start;
  1183   }
  1203   }
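The conjoint byte stub above copies backwards because the two ranges may overlap with the destination above the source; array_overlap_test() falls through to this path only in that case, and the comment notes it is left unoptimized. A plain C++ equivalent of the decrementing lbzx/stbx loop between l_2 and l_1, for illustration only:

// Plain C++ shape of the reverse (conjoint) byte copy above. Copying from the
// last element down keeps overlapping ranges with to > from intact.
#include <cstddef>
#include <cstdint>

static void conjoint_byte_copy_sketch(const uint8_t* from, uint8_t* to, size_t count) {
  for (size_t i = count; i-- > 0; ) {
    to[i] = from[i];
  }
}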
  1250 
  1270 
  1251     address start = __ function_entry();
  1271     address start = __ function_entry();
  1252     assert_positive_int(R5_ARG3);
  1272     assert_positive_int(R5_ARG3);
  1253 
  1273 
  1254     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
  1274     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
  1255 
  1275     {
  1256     // don't try anything fancy if arrays don't have many elements
  1276       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
  1257     __ li(tmp3, 0);
  1277       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
  1258     __ cmpwi(CCR0, R5_ARG3, 9);
  1278       // don't try anything fancy if arrays don't have many elements
  1259     __ ble(CCR0, l_6); // copy 2 at a time
  1279       __ li(tmp3, 0);
  1260 
  1280       __ cmpwi(CCR0, R5_ARG3, 9);
  1261     if (!aligned) {
  1281       __ ble(CCR0, l_6); // copy 2 at a time
  1262       __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1282 
  1263       __ andi_(tmp1, tmp1, 3);
  1283       if (!aligned) {
  1264       __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
  1284         __ xorr(tmp1, R3_ARG1, R4_ARG2);
  1265 
  1285         __ andi_(tmp1, tmp1, 3);
  1266       // At this point it is guaranteed that both, from and to have the same alignment mod 4.
  1286         __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
  1267 
  1287 
  1268       // Copy 1 element if necessary to align to 4 bytes.
  1288         // At this point it is guaranteed that both, from and to have the same alignment mod 4.
  1269       __ andi_(tmp1, R3_ARG1, 3);
  1289 
  1270       __ beq(CCR0, l_2);
  1290         // Copy 1 element if necessary to align to 4 bytes.
  1271 
  1291         __ andi_(tmp1, R3_ARG1, 3);
  1272       __ lhz(tmp2, 0, R3_ARG1);
  1292         __ beq(CCR0, l_2);
  1273       __ addi(R3_ARG1, R3_ARG1, 2);
  1293 
  1274       __ sth(tmp2, 0, R4_ARG2);
  1294         __ lhz(tmp2, 0, R3_ARG1);
  1275       __ addi(R4_ARG2, R4_ARG2, 2);
  1295         __ addi(R3_ARG1, R3_ARG1, 2);
  1276       __ addi(R5_ARG3, R5_ARG3, -1);
  1296         __ sth(tmp2, 0, R4_ARG2);
  1277       __ bind(l_2);
  1297         __ addi(R4_ARG2, R4_ARG2, 2);
  1278 
  1298         __ addi(R5_ARG3, R5_ARG3, -1);
  1279       // At this point the positions of both, from and to, are at least 4 byte aligned.
  1299         __ bind(l_2);
  1280 
  1300 
  1281       // Copy 4 elements at a time.
  1301         // At this point the positions of both, from and to, are at least 4 byte aligned.
  1282       // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
  1302 
  1283       __ xorr(tmp2, R3_ARG1, R4_ARG2);
  1303         // Copy 4 elements at a time.
  1284       __ andi_(tmp1, tmp2, 7);
  1304         // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
  1285       __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
  1305         __ xorr(tmp2, R3_ARG1, R4_ARG2);
  1286 
  1306         __ andi_(tmp1, tmp2, 7);
  1287       // Copy a 2-element word if necessary to align to 8 bytes.
  1307         __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
  1288       __ andi_(R0, R3_ARG1, 7);
  1308 
  1289       __ beq(CCR0, l_7);
  1309         // Copy a 2-element word if necessary to align to 8 bytes.
  1290 
  1310         __ andi_(R0, R3_ARG1, 7);
  1291       __ lwzx(tmp2, R3_ARG1, tmp3);
  1311         __ beq(CCR0, l_7);
  1292       __ addi(R5_ARG3, R5_ARG3, -2);
  1312 
  1293       __ stwx(tmp2, R4_ARG2, tmp3);
  1313         __ lwzx(tmp2, R3_ARG1, tmp3);
       
  1314         __ addi(R5_ARG3, R5_ARG3, -2);
       
  1315         __ stwx(tmp2, R4_ARG2, tmp3);
       
  1316         { // FasterArrayCopy
       
  1317           __ addi(R3_ARG1, R3_ARG1, 4);
       
  1318           __ addi(R4_ARG2, R4_ARG2, 4);
       
  1319         }
       
  1320       }
       
  1321 
       
  1322       __ bind(l_7);
       
  1323 
       
  1324       // Copy 4 elements at a time; either the loads or the stores can
       
  1325       // be unaligned if aligned == false.
       
  1326 
  1294       { // FasterArrayCopy
  1327       { // FasterArrayCopy
       
  1328         __ cmpwi(CCR0, R5_ARG3, 15);
       
  1329         __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
       
  1330 
       
  1331         __ srdi(tmp1, R5_ARG3, 4);
       
  1332         __ andi_(R5_ARG3, R5_ARG3, 15);
       
  1333         __ mtctr(tmp1);
       
  1334 
       
  1335         if (!VM_Version::has_vsx()) {
       
  1336 
       
  1337           __ bind(l_8);
       
  1338           // Use unrolled version for mass copying (copy 16 elements a time).
       
  1339           // Load feeding store gets zero latency on Power6, however not on Power5.
       
  1340           // Therefore, the following sequence is made for the good of both.
       
  1341           __ ld(tmp1, 0, R3_ARG1);
       
  1342           __ ld(tmp2, 8, R3_ARG1);
       
  1343           __ ld(tmp3, 16, R3_ARG1);
       
  1344           __ ld(tmp4, 24, R3_ARG1);
       
  1345           __ std(tmp1, 0, R4_ARG2);
       
  1346           __ std(tmp2, 8, R4_ARG2);
       
  1347           __ std(tmp3, 16, R4_ARG2);
       
  1348           __ std(tmp4, 24, R4_ARG2);
       
  1349           __ addi(R3_ARG1, R3_ARG1, 32);
       
  1350           __ addi(R4_ARG2, R4_ARG2, 32);
       
  1351           __ bdnz(l_8);
       
  1352 
       
  1353         } else { // Processor supports VSX, so use it to mass copy.
       
  1354 
       
  1355           // Prefetch src data into L2 cache.
       
  1356           __ dcbt(R3_ARG1, 0);
       
  1357 
       
  1358           // If supported set DSCR pre-fetch to deepest.
       
  1359           if (VM_Version::has_mfdscr()) {
       
  1360             __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
       
  1361             __ mtdscr(tmp2);
       
  1362           }
       
  1363           __ li(tmp1, 16);
       
  1364 
       
  1365           // Backbranch target aligned to 32-byte. It's not aligned 16-byte
       
  1366           // as loop contains < 8 instructions that fit inside a single
       
  1367           // i-cache sector.
       
  1368           __ align(32);
       
  1369 
       
  1370           __ bind(l_9);
       
  1371           // Use loop with VSX load/store instructions to
       
  1372           // copy 16 elements a time.
       
  1373           __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
       
  1374           __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
       
  1375           __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
       
  1376           __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
       
  1377           __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
       
   1378           __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32.
       
  1379           __ bdnz(l_9);                        // Dec CTR and loop if not zero.
       
  1380 
       
  1381           // Restore DSCR pre-fetch value.
       
  1382           if (VM_Version::has_mfdscr()) {
       
  1383             __ load_const_optimized(tmp2, VM_Version::_dscr_val);
       
  1384             __ mtdscr(tmp2);
       
  1385           }
       
  1386 
       
  1387         }
       
  1388       } // FasterArrayCopy
       
  1389       __ bind(l_6);
       
  1390 
       
  1391       // copy 2 elements at a time
       
  1392       { // FasterArrayCopy
       
  1393         __ cmpwi(CCR0, R5_ARG3, 2);
       
  1394         __ blt(CCR0, l_1);
       
  1395         __ srdi(tmp1, R5_ARG3, 1);
       
  1396         __ andi_(R5_ARG3, R5_ARG3, 1);
       
  1397 
       
  1398         __ addi(R3_ARG1, R3_ARG1, -4);
       
  1399         __ addi(R4_ARG2, R4_ARG2, -4);
       
  1400         __ mtctr(tmp1);
       
  1401 
       
  1402         __ bind(l_3);
       
  1403         __ lwzu(tmp2, 4, R3_ARG1);
       
  1404         __ stwu(tmp2, 4, R4_ARG2);
       
  1405         __ bdnz(l_3);
       
  1406 
  1295         __ addi(R3_ARG1, R3_ARG1, 4);
  1407         __ addi(R3_ARG1, R3_ARG1, 4);
  1296         __ addi(R4_ARG2, R4_ARG2, 4);
  1408         __ addi(R4_ARG2, R4_ARG2, 4);
  1297       }
  1409       }
  1298     }
  1410 
  1299 
  1411       // do single element copy
  1300     __ bind(l_7);
  1412       __ bind(l_1);
  1301 
  1413       __ cmpwi(CCR0, R5_ARG3, 0);
  1302     // Copy 4 elements at a time; either the loads or the stores can
  1414       __ beq(CCR0, l_4);
  1303     // be unaligned if aligned == false.
  1415 
  1304 
  1416       { // FasterArrayCopy
  1305     { // FasterArrayCopy
  1417         __ mtctr(R5_ARG3);
  1306       __ cmpwi(CCR0, R5_ARG3, 15);
  1418         __ addi(R3_ARG1, R3_ARG1, -2);
  1307       __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
  1419         __ addi(R4_ARG2, R4_ARG2, -2);
  1308 
  1420 
  1309       __ srdi(tmp1, R5_ARG3, 4);
  1421         __ bind(l_5);
  1310       __ andi_(R5_ARG3, R5_ARG3, 15);
  1422         __ lhzu(tmp2, 2, R3_ARG1);
  1311       __ mtctr(tmp1);
  1423         __ sthu(tmp2, 2, R4_ARG2);
  1312 
  1424         __ bdnz(l_5);
  1313       if (!VM_Version::has_vsx()) {
       
  1314 
       
  1315         __ bind(l_8);
       
  1316         // Use unrolled version for mass copying (copy 16 elements a time).
       
  1317         // Load feeding store gets zero latency on Power6, however not on Power5.
       
  1318         // Therefore, the following sequence is made for the good of both.
       
  1319         __ ld(tmp1, 0, R3_ARG1);
       
  1320         __ ld(tmp2, 8, R3_ARG1);
       
  1321         __ ld(tmp3, 16, R3_ARG1);
       
  1322         __ ld(tmp4, 24, R3_ARG1);
       
  1323         __ std(tmp1, 0, R4_ARG2);
       
  1324         __ std(tmp2, 8, R4_ARG2);
       
  1325         __ std(tmp3, 16, R4_ARG2);
       
  1326         __ std(tmp4, 24, R4_ARG2);
       
  1327         __ addi(R3_ARG1, R3_ARG1, 32);
       
  1328         __ addi(R4_ARG2, R4_ARG2, 32);
       
  1329         __ bdnz(l_8);
       
  1330 
       
  1331       } else { // Processor supports VSX, so use it to mass copy.
       
  1332 
       
  1333         // Prefetch src data into L2 cache.
       
  1334         __ dcbt(R3_ARG1, 0);
       
  1335 
       
  1336         // If supported set DSCR pre-fetch to deepest.
       
  1337         if (VM_Version::has_mfdscr()) {
       
  1338           __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
       
  1339           __ mtdscr(tmp2);
       
  1340         }
       
  1341         __ li(tmp1, 16);
       
  1342 
       
  1343         // Backbranch target aligned to 32-byte. It's not aligned 16-byte
       
  1344         // as loop contains < 8 instructions that fit inside a single
       
  1345         // i-cache sector.
       
  1346         __ align(32);
       
  1347 
       
  1348         __ bind(l_9);
       
  1349         // Use loop with VSX load/store instructions to
       
  1350         // copy 16 elements a time.
       
  1351         __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
       
  1352         __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
       
  1353         __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
       
  1354         __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
       
  1355         __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
       
   1356         __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32.
       
  1357         __ bdnz(l_9);                        // Dec CTR and loop if not zero.
       
  1358 
       
  1359         // Restore DSCR pre-fetch value.
       
  1360         if (VM_Version::has_mfdscr()) {
       
  1361           __ load_const_optimized(tmp2, VM_Version::_dscr_val);
       
  1362           __ mtdscr(tmp2);
       
  1363         }
       
  1364 
       
  1365       }
  1425       }
  1366     } // FasterArrayCopy
  1426     }
  1367     __ bind(l_6);
  1427 
  1368 
       
  1369     // copy 2 elements at a time
       
  1370     { // FasterArrayCopy
       
  1371       __ cmpwi(CCR0, R5_ARG3, 2);
       
  1372       __ blt(CCR0, l_1);
       
  1373       __ srdi(tmp1, R5_ARG3, 1);
       
  1374       __ andi_(R5_ARG3, R5_ARG3, 1);
       
  1375 
       
  1376       __ addi(R3_ARG1, R3_ARG1, -4);
       
  1377       __ addi(R4_ARG2, R4_ARG2, -4);
       
  1378       __ mtctr(tmp1);
       
  1379 
       
  1380       __ bind(l_3);
       
  1381       __ lwzu(tmp2, 4, R3_ARG1);
       
  1382       __ stwu(tmp2, 4, R4_ARG2);
       
  1383       __ bdnz(l_3);
       
  1384 
       
  1385       __ addi(R3_ARG1, R3_ARG1, 4);
       
  1386       __ addi(R4_ARG2, R4_ARG2, 4);
       
  1387     }
       
  1388 
       
  1389     // do single element copy
       
  1390     __ bind(l_1);
       
  1391     __ cmpwi(CCR0, R5_ARG3, 0);
       
  1392     __ beq(CCR0, l_4);
       
  1393 
       
  1394     { // FasterArrayCopy
       
  1395       __ mtctr(R5_ARG3);
       
  1396       __ addi(R3_ARG1, R3_ARG1, -2);
       
  1397       __ addi(R4_ARG2, R4_ARG2, -2);
       
  1398 
       
  1399       __ bind(l_5);
       
  1400       __ lhzu(tmp2, 2, R3_ARG1);
       
  1401       __ sthu(tmp2, 2, R4_ARG2);
       
  1402       __ bdnz(l_5);
       
  1403     }
       
  1404     __ bind(l_4);
  1428     __ bind(l_4);
  1405     __ li(R3_RET, 0); // return 0
  1429     __ li(R3_RET, 0); // return 0
  1406     __ blr();
  1430     __ blr();
  1407 
  1431 
  1408     return start;
  1432     return start;
  1430       STUB_ENTRY(jshort_disjoint_arraycopy);
  1454       STUB_ENTRY(jshort_disjoint_arraycopy);
  1431 
  1455 
  1432     array_overlap_test(nooverlap_target, 1);
  1456     array_overlap_test(nooverlap_target, 1);
  1433 
  1457 
  1434     Label l_1, l_2;
  1458     Label l_1, l_2;
  1435     __ sldi(tmp1, R5_ARG3, 1);
  1459     {
  1436     __ b(l_2);
  1460       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
  1437     __ bind(l_1);
  1461       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
  1438     __ sthx(tmp2, R4_ARG2, tmp1);
  1462       __ sldi(tmp1, R5_ARG3, 1);
  1439     __ bind(l_2);
  1463       __ b(l_2);
  1440     __ addic_(tmp1, tmp1, -2);
  1464       __ bind(l_1);
  1441     __ lhzx(tmp2, R3_ARG1, tmp1);
  1465       __ sthx(tmp2, R4_ARG2, tmp1);
  1442     __ bge(CCR0, l_1);
  1466       __ bind(l_2);
  1443 
  1467       __ addic_(tmp1, tmp1, -2);
       
  1468       __ lhzx(tmp2, R3_ARG1, tmp1);
       
  1469       __ bge(CCR0, l_1);
       
  1470     }
  1444     __ li(R3_RET, 0); // return 0
  1471     __ li(R3_RET, 0); // return 0
  1445     __ blr();
  1472     __ blr();
  1446 
  1473 
  1447     return start;
  1474     return start;
  1448   }
  1475   }
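The jshort conjoint stub uses the same backwards walk, but indexes by byte offset: sldi(tmp1, R5_ARG3, 1) pre-scales the element count by two, and the loop then steps the offset down by 2 per element. An illustrative C++ sketch of that effect:

// Illustrative sketch of the jshort conjoint loop above: the index is a byte
// offset, pre-scaled by sldi (count << 1) and decremented by 2 per iteration.
#include <cstddef>
#include <cstdint>
#include <cstring>

static void conjoint_short_copy_sketch(const uint8_t* from, uint8_t* to, size_t count) {
  for (ptrdiff_t off = static_cast<ptrdiff_t>(count << 1) - 2; off >= 0; off -= 2) {
    uint16_t v;
    std::memcpy(&v, from + off, sizeof v);   // lhzx
    std::memcpy(to + off, &v, sizeof v);     // sthx
  }
}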
  1586   //
  1613   //
  1587   address generate_disjoint_int_copy(bool aligned, const char * name) {
  1614   address generate_disjoint_int_copy(bool aligned, const char * name) {
  1588     StubCodeMark mark(this, "StubRoutines", name);
  1615     StubCodeMark mark(this, "StubRoutines", name);
  1589     address start = __ function_entry();
  1616     address start = __ function_entry();
  1590     assert_positive_int(R5_ARG3);
  1617     assert_positive_int(R5_ARG3);
  1591     generate_disjoint_int_copy_core(aligned);
  1618     {
       
  1619       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
       
  1620       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
       
  1621       generate_disjoint_int_copy_core(aligned);
       
  1622     }
  1592     __ li(R3_RET, 0); // return 0
  1623     __ li(R3_RET, 0); // return 0
  1593     __ blr();
  1624     __ blr();
  1594     return start;
  1625     return start;
  1595   }
  1626   }
  1596 
  1627 
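The int and long copies above and below all follow one pattern: the copy core is emitted inside a scope holding an UnsafeCopyMemoryMark constructed with add_entry = !aligned and continue_at_scope_end = false, so only the Unsafe-reachable (unaligned) variants are registered and all of them resume at the common error exit. A rough sketch of what such a RAII mark plausibly does; the real class is declared in stubCodeGenerator.hpp, and the table-helper names below are assumptions rather than the verified API:

// Rough sketch of the RAII guard used above (names of the table helpers are assumed).
class UnsafeCopyMemoryMarkSketch {
  StubCodeGenerator* _cgen;
  bool    _add_entry;               // only the unaligned (Unsafe-reachable) stubs register
  bool    _continue_at_scope_end;   // false above: resume at the common error exit instead
  address _start_pc;
 public:
  UnsafeCopyMemoryMarkSketch(StubCodeGenerator* cgen, bool add_entry, bool continue_at_scope_end)
    : _cgen(cgen), _add_entry(add_entry), _continue_at_scope_end(continue_at_scope_end),
      _start_pc(cgen->assembler()->pc()) {}
  ~UnsafeCopyMemoryMarkSketch() {
    if (!_add_entry) return;
    address end_pc = _cgen->assembler()->pc();
    // A fault with pc in [_start_pc, end_pc) resumes either just past this scope or
    // at the stub published via UnsafeCopyMemory::set_common_exit_stub_pc().
    address continue_pc = _continue_at_scope_end ? end_pc
                                                 : UnsafeCopyMemory::common_exit_stub_pc();
    UnsafeCopyMemory::add_to_table(_start_pc, end_pc, continue_pc);
  }
};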
  1734     address nooverlap_target = aligned ?
  1765     address nooverlap_target = aligned ?
  1735       STUB_ENTRY(arrayof_jint_disjoint_arraycopy) :
  1766       STUB_ENTRY(arrayof_jint_disjoint_arraycopy) :
  1736       STUB_ENTRY(jint_disjoint_arraycopy);
  1767       STUB_ENTRY(jint_disjoint_arraycopy);
  1737 
  1768 
  1738     array_overlap_test(nooverlap_target, 2);
  1769     array_overlap_test(nooverlap_target, 2);
  1739 
  1770     {
  1740     generate_conjoint_int_copy_core(aligned);
  1771       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
       
  1772       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
       
  1773       generate_conjoint_int_copy_core(aligned);
       
  1774     }
  1741 
  1775 
  1742     __ li(R3_RET, 0); // return 0
  1776     __ li(R3_RET, 0); // return 0
  1743     __ blr();
  1777     __ blr();
  1744 
  1778 
  1745     return start;
  1779     return start;
  1857   //
  1891   //
  1858   address generate_disjoint_long_copy(bool aligned, const char * name) {
  1892   address generate_disjoint_long_copy(bool aligned, const char * name) {
  1859     StubCodeMark mark(this, "StubRoutines", name);
  1893     StubCodeMark mark(this, "StubRoutines", name);
  1860     address start = __ function_entry();
  1894     address start = __ function_entry();
  1861     assert_positive_int(R5_ARG3);
  1895     assert_positive_int(R5_ARG3);
  1862     generate_disjoint_long_copy_core(aligned);
  1896     {
       
  1897       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
       
  1898       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
       
  1899       generate_disjoint_long_copy_core(aligned);
       
  1900     }
  1863     __ li(R3_RET, 0); // return 0
  1901     __ li(R3_RET, 0); // return 0
  1864     __ blr();
  1902     __ blr();
  1865 
  1903 
  1866     return start;
  1904   return start;
  1867   }
  1905   }
  1868 
  1906 
  1869   // Generate core code for conjoint long copy (and oop copy on
  1907   // Generate core code for conjoint long copy (and oop copy on
  1870   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
  1908   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
  1871   // are assumed to be heapword aligned.
  1909   // are assumed to be heapword aligned.
  1984     address nooverlap_target = aligned ?
  2022     address nooverlap_target = aligned ?
  1985       STUB_ENTRY(arrayof_jlong_disjoint_arraycopy) :
  2023       STUB_ENTRY(arrayof_jlong_disjoint_arraycopy) :
  1986       STUB_ENTRY(jlong_disjoint_arraycopy);
  2024       STUB_ENTRY(jlong_disjoint_arraycopy);
  1987 
  2025 
  1988     array_overlap_test(nooverlap_target, 3);
  2026     array_overlap_test(nooverlap_target, 3);
  1989     generate_conjoint_long_copy_core(aligned);
  2027     {
  1990 
  2028       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
       
  2029       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
       
  2030       generate_conjoint_long_copy_core(aligned);
       
  2031     }
  1991     __ li(R3_RET, 0); // return 0
  2032     __ li(R3_RET, 0); // return 0
  1992     __ blr();
  2033     __ blr();
  1993 
  2034 
  1994     return start;
  2035     return start;
  1995   }
  2036   }
  3005   }
  3046   }
  3006 
  3047 
  3007   void generate_arraycopy_stubs() {
  3048   void generate_arraycopy_stubs() {
  3008     // Note: the disjoint stubs must be generated first, some of
  3049     // Note: the disjoint stubs must be generated first, some of
  3009     // the conjoint stubs use them.
  3050     // the conjoint stubs use them.
       
  3051 
       
  3052     address ucm_common_error_exit       =  generate_unsafecopy_common_error_exit();
       
  3053     UnsafeCopyMemory::set_common_exit_stub_pc(ucm_common_error_exit);
  3010 
  3054 
  3011     // non-aligned disjoint versions
  3055     // non-aligned disjoint versions
  3012     StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
  3056     StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
  3013     StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
  3057     StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
  3014     StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
  3058     StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
  3577       generate_initial();
  3621       generate_initial();
  3578     }
  3622     }
  3579   }
  3623   }
  3580 };
  3624 };
  3581 
  3625 
       
  3626 #define UCM_TABLE_MAX_ENTRIES 8
  3582 void StubGenerator_generate(CodeBuffer* code, bool all) {
  3627 void StubGenerator_generate(CodeBuffer* code, bool all) {
       
  3628   if (UnsafeCopyMemory::_table == NULL) {
       
  3629     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
       
  3630   }
  3583   StubGenerator g(code, all);
  3631   StubGenerator g(code, all);
  3584 }
  3632 }
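Taken together, the hunks above establish a fixed initialization order: the UnsafeCopyMemory table is created before any stub generation runs, and generate_arraycopy_stubs() emits and publishes the common error exit before the copy stubs whose marks refer to it. A condensed, comment-annotated view of that order, using only identifiers that appear in this changeset (everything else in both functions is elided here):

// Condensed view of the initialization order this change relies on.
#define UCM_TABLE_MAX_ENTRIES 8

void StubGenerator_generate(CodeBuffer* code, bool all) {
  if (UnsafeCopyMemory::_table == NULL) {
    UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);   // 1. table exists first
  }
  StubGenerator g(code, all);                                // 2. stub generation may now add entries
}

void StubGenerator::generate_arraycopy_stubs() {
  // 3. The shared error exit is generated and published before any copy stub,
  //    so every UnsafeCopyMemoryMark destructor can record it as the resume pc.
  address ucm_common_error_exit = generate_unsafecopy_common_error_exit();
  UnsafeCopyMemory::set_common_exit_stub_pc(ucm_common_error_exit);
  // ... disjoint stubs first, then the conjoint stubs that branch to them ...
}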