hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp
changeset 31961 70adcff5840c
parent 31955 c6ac18ab3d6b
child 32596 8feecdee3156
child 32574 6c3b890aa5d9
equal deleted inserted replaced
31960:4e66771a3e0a 31961:70adcff5840c
  2433     __ ret(lr);
  2433     __ ret(lr);
  2434 
  2434 
  2435     return start;
  2435     return start;
  2436   }
  2436   }
  2437 
  2437 
       
  2438   /**
       
  2439    *  Emits the "ghash_processBlocks" stub and returns its entry address.
       
  2440    *  Stub arguments:
       
  2441    *  Input:
       
  2442    *  c_rarg0   - current state address (16 bytes are loaded from it and stored back)
       
  2443    *  c_rarg1   - H key address (16 bytes are loaded from it)
       
  2444    *  c_rarg2   - data address (16 bytes are read per block, post-incrementing the pointer)
       
  2445    *  c_rarg3   - number of blocks; the loop body runs before the count is tested,
       
  2446    *              so presumably callers must pass a non-zero count (TODO confirm)
       
  2447    *  Output:
       
  2448    *  Updated state at c_rarg0
       
  2449    */
       
  2450   address generate_ghash_processBlocks() {
       
  2451     __ align(CodeEntryAlignment);
       
  2452     Label L_ghash_loop, L_exit; // NOTE(review): L_exit is never bound or branched to in this stub
       
  2453 
       
  2454     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
       
  2455     address start = __ pc();
       
  2456 
       
  2457     Register state   = c_rarg0;
       
  2458     Register subkeyH = c_rarg1;
       
  2459     Register data    = c_rarg2;
       
  2460     Register blocks  = c_rarg3;
       
  2461 
       
  2462     FloatRegister vzr = v30;
       
  2463     __ eor(vzr, __ T16B, vzr, vzr); // zero register
       
  2464 
       
  2465     __ mov(v26, __ T16B, 1);  // v26..v29 are used below as the shift-count operands of ushl
       
  2466     __ mov(v27, __ T16B, 63);
       
  2467     __ mov(v28, __ T16B, 62);
       
  2468     __ mov(v29, __ T16B, 57);
       
  2469 
       
  2470     __ ldrq(v6, Address(state));
       
  2471     __ ldrq(v16, Address(subkeyH));
       
  2472 
       
  2473     __ ext(v0, __ T16B, v6, v6, 0x08);   // v0 = state rotated by 8 bytes (64-bit halves swapped)
       
  2474     __ ext(v1, __ T16B, v16, v16, 0x08); // v1 = H key rotated by 8 bytes
       
  2475     __ eor(v16, __ T16B, v16, v1);       // v16 = xor of the two H halves, the (B1 + B0) operand below
       
  2476 
       
  2477     __ bind(L_ghash_loop);
       
  2478 
       
  2479     __ ldrq(v2, Address(__ post(data, 0x10))); // load one 16-byte block and advance data
       
  2480     __ rev64(v2, __ T16B, v2); // swap data
       
  2481 
       
  2482     __ ext(v6, __ T16B, v0, v0, 0x08);
       
  2483     __ eor(v6, __ T16B, v6, v2); // fold the data block into the running state
       
  2484     __ ext(v2, __ T16B, v6, v6, 0x08);
       
  2485 
       
  2486     __ pmull2(v7, __ T1Q, v2, v1, __ T2D);  // A1*B1
       
  2487     __ eor(v6, __ T16B, v6, v2);
       
  2488     __ pmull(v5,  __ T1Q, v2, v1, __ T1D);  // A0*B0
       
  2489     __ pmull(v20, __ T1Q, v6, v16, __ T1D);  // (A1 + A0)(B1 + B0)
       
  2490 
       
  2491     __ ext(v21, __ T16B, v5, v7, 0x08);
       
  2492     __ eor(v18, __ T16B, v7, v5); // A1*B1 xor A0*B0
       
  2493     __ eor(v20, __ T16B, v20, v21);
       
  2494     __ eor(v20, __ T16B, v20, v18);
       
  2495 
       
  2496     // Register pair <v7:v5> holds the result of carry-less multiplication
       
  2497     __ ins(v7, __ D, v20, 0, 1);
       
  2498     __ ins(v5, __ D, v20, 1, 0);
       
  2499 
       
  2500     // Result of the multiplication is shifted by one bit position
       
  2501     // [X3:X2:X1:X0] = [X3:X2:X1:X0] << 1
       
  2502     __ ushr(v18, __ T2D, v5, -63 & 63); // NOTE(review): presumably encodes a right shift by 63 to extract each lane's carry bit - confirm against Assembler::ushr
       
  2503     __ ins(v25, __ D, v18, 1, 0);
       
  2504     __ ins(v25, __ D, vzr, 0, 0);
       
  2505     __ ushl(v5, __ T2D, v5, v26);
       
  2506     __ orr(v5, __ T16B, v5, v25);
       
  2507 
       
  2508     __ ushr(v19, __ T2D, v7, -63 & 63);
       
  2509     __ ins(v19, __ D, v19, 1, 0);
       
  2510     __ ins(v19, __ D, v18, 0, 1);
       
  2511     __ ushl(v7, __ T2D, v7, v26);
       
  2512     __ orr(v6, __ T16B, v7, v19); // v6 is the [X3:X2] term xor-ed into the result at the end of the loop body
       
  2513 
       
  2514     __ ins(v24, __ D, v5, 0, 1);
       
  2515 
       
  2516     // A = X0 << 63
       
  2517     __ ushl(v21, __ T2D, v5, v27);
       
  2518 
       
  2519     // B = X0 << 62
       
  2520     __ ushl(v22, __ T2D, v5, v28);
       
  2521 
       
  2522     // C = X0 << 57
       
  2523     __ ushl(v23, __ T2D, v5, v29);
       
  2524 
       
  2525     // D = X1^A^B^C
       
  2526     __ eor(v21, __ T16B, v21, v22);
       
  2527     __ eor(v21, __ T16B, v21, v23);
       
  2528     __ eor(v21, __ T16B, v21, v24);
       
  2529     __ ins(v5, __ D, v21, 1, 0);
       
  2530 
       
  2531     // [E1:E0] = [D:X0] >> 1
       
  2532     __ ushr(v20, __ T2D, v5, -1 & 63); // NOTE(review): same immediate convention as the -63 & 63 shifts above - confirm
       
  2533     __ ushl(v18, __ T2D, v5, v27);
       
  2534     __ ext(v25, __ T16B, v18, vzr, 0x08);
       
  2535     __ orr(v19, __ T16B, v20, v25);
       
  2536 
       
  2537     __ eor(v7, __ T16B, v5, v19);
       
  2538 
       
  2539     // [F1:F0] = [D:X0] >> 2
       
  2540     __ ushr(v20, __ T2D, v5, -2 & 63);
       
  2541     __ ushl(v18, __ T2D, v5, v28);
       
  2542     __ ins(v25, __ D, v18, 0, 1);
       
  2543     __ orr(v19, __ T16B, v20, v25);
       
  2544 
       
  2545     __ eor(v7, __ T16B, v7, v19);
       
  2546 
       
  2547     // [G1:G0] = [D:X0] >> 7
       
  2548     __ ushr(v20, __ T2D, v5, -7 & 63);
       
  2549     __ ushl(v18, __ T2D, v5, v29);
       
  2550     __ ins(v25, __ D, v18, 0, 1);
       
  2551     __ orr(v19, __ T16B, v20, v25);
       
  2552 
       
  2553     // [H1:H0] = [D^E1^F1^G1:X0^E0^F0^G0]
       
  2554     __ eor(v7, __ T16B, v7, v19);
       
  2555 
       
  2556     // Result = [H1:H0]^[X3:X2]
       
  2557     __ eor(v0, __ T16B, v7, v6);
       
  2558 
       
  2559     __ subs(blocks, blocks, 1); // NOTE(review): subs also sets the condition flags; nothing visible in this stub reads them
       
  2560     __ cbnz(blocks, L_ghash_loop);
       
  2561 
       
  2562     __ ext(v1, __ T16B, v0, v0, 0x08); // rotate by 8 bytes again before storing, mirroring the ext applied after the initial load
       
  2563     __ st1(v1, __ T16B, state);        // write the updated state back through c_rarg0
       
  2564     __ ret(lr);
       
  2565 
       
  2566     return start;
       
  2567   }
       
  2568 
  2438   // Continuation point for throwing of implicit exceptions that are
  2569   // Continuation point for throwing of implicit exceptions that are
  2439   // not handled in the current activation. Fabricates an exception
  2570   // not handled in the current activation. Fabricates an exception
  2440   // oop and initiates normal exception dispatching in this
  2571   // oop and initiates normal exception dispatching in this
  2441   // frame. Since we need to preserve callee-saved values (currently
  2572   // frame. Since we need to preserve callee-saved values (currently
  2442   // only for C2, but done for C1 as well) we need a callee-saved oop
  2573   // only for C2, but done for C1 as well) we need a callee-saved oop
  3436       // because it's faster for the sizes of modulus we care about.
  3567       // because it's faster for the sizes of modulus we care about.
  3437       StubRoutines::_montgomerySquare = g.generate_multiply();
  3568       StubRoutines::_montgomerySquare = g.generate_multiply();
  3438     }
  3569     }
  3439 
  3570 
  3440 #ifndef BUILTIN_SIM
  3571 #ifndef BUILTIN_SIM
       
  3572     // generate GHASH intrinsics code
       
  3573     if (UseGHASHIntrinsics) {
       
  3574       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
       
  3575     }
       
  3576 
  3441     if (UseAESIntrinsics) {
  3577     if (UseAESIntrinsics) {
  3442       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
  3578       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
  3443       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
  3579       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
  3444       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
  3580       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
  3445       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
  3581       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();