2433 __ ret(lr); |
2433 __ ret(lr); |
2434 |
2434 |
2435 return start; |
2435 return start; |
2436 } |
2436 } |
2437 |
2437 |
|
2438 /** |
 * Emits the "ghash_processBlocks" stub and returns the address of its
 * first instruction. For each 16-byte block read from the data address,
 * the emitted code xors the block into the running 128-bit state,
 * carry-less multiplies the result by the subkey H (three pmull/pmull2
 * products combined Karatsuba-style) and reduces the 256-bit product
 * back to 128 bits, as described by the inline comments.
 *

2439 * Arguments: |

2440 * |

2441 * Input: |

2442 * c_rarg0 - current state address |

2443 * c_rarg1 - H key address |

2444 * c_rarg2 - data address |

2445 * c_rarg3 - number of blocks |

2446 * |

2447 * Output: |

2448 * Updated state at c_rarg0 |
 *
 * Registers written by the emitted code:
 *   c_rarg2 (post-incremented by 0x10 per block), c_rarg3 (decremented
 *   once per block), v0-v2, v5-v7, v16 and v18-v30.
 *
 * NOTE(review): the loop tests the block count only after a block has
 * been processed, so callers presumably guarantee c_rarg3 >= 1 - confirm
 * at the call sites.

2449 */ |

2450 address generate_ghash_processBlocks() { |

2451 __ align(CodeEntryAlignment); |

// NOTE(review): L_exit is declared but never bound or branched to in this function.
2452 Label L_ghash_loop, L_exit; |

2453 |

2454 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); |

// Entry point of the stub; this is the value returned to the caller.
2455 address start = __ pc(); |

2456 |

2457 Register state = c_rarg0; |

2458 Register subkeyH = c_rarg1; |

2459 Register data = c_rarg2; |

2460 Register blocks = c_rarg3; |

2461 |

2462 FloatRegister vzr = v30; |

2463 __ eor(vzr, __ T16B, vzr, vzr); // zero register |

2464 |

// Constants replicated into every byte; they are passed to ushl below as the
// left-shift counts 1, 63, 62 and 57.
2465 __ mov(v26, __ T16B, 1); |

2466 __ mov(v27, __ T16B, 63); |

2467 __ mov(v28, __ T16B, 62); |

2468 __ mov(v29, __ T16B, 57); |

2469 |

// Load the 128-bit state and the 128-bit subkey H.
2470 __ ldrq(v6, Address(state)); |

2471 __ ldrq(v16, Address(subkeyH)); |

2472 |

// ext #8 with both sources equal swaps the two 64-bit halves of a register:
// v0 = half-swapped state, v1 = half-swapped H. v16 becomes H ^ swapped(H), so
// both of its halves hold (B1 ^ B0), the second operand of the middle product.
2473 __ ext(v0, __ T16B, v6, v6, 0x08); |

2474 __ ext(v1, __ T16B, v16, v16, 0x08); |

2475 __ eor(v16, __ T16B, v16, v1); |

2476 |

// Top of the per-block loop; v0 carries the running state between iterations.
2477 __ bind(L_ghash_loop); |

2478 |

// Fetch the next 16-byte block and advance the data pointer past it.
2479 __ ldrq(v2, Address(__ post(data, 0x10))); |

2480 __ rev64(v2, __ T16B, v2); // byte-reverse each 64-bit half of the block |

2481 |

// Swap the halves of the running state back, xor in the block, then swap the
// halves of the sum again to form the multiplicand in v2.
2482 __ ext(v6, __ T16B, v0, v0, 0x08); |

2483 __ eor(v6, __ T16B, v6, v2); |

2484 __ ext(v2, __ T16B, v6, v6, 0x08); |

2485 |

// Three 64x64 carry-less multiplies. The eor in between makes both halves of
// v6 equal to (A1 ^ A0) for the middle product.
2486 __ pmull2(v7, __ T1Q, v2, v1, __ T2D); // A1*B1 |

2487 __ eor(v6, __ T16B, v6, v2); |

2488 __ pmull(v5, __ T1Q, v2, v1, __ T1D); // A0*B0 |

2489 __ pmull(v20, __ T1Q, v6, v16, __ T1D); // (A1 + A0)(B1 + B0) |

2490 |

// Fold the outer products into the middle term, leaving it in v20.
2491 __ ext(v21, __ T16B, v5, v7, 0x08); |

2492 __ eor(v18, __ T16B, v7, v5); // A1*B1 xor A0*B0 |

2493 __ eor(v20, __ T16B, v20, v21); |

2494 __ eor(v20, __ T16B, v20, v18); |

2495 |

2496 // Register pair <v7:v5> holds the result of carry-less multiplication |

2497 __ ins(v7, __ D, v20, 0, 1); |

2498 __ ins(v5, __ D, v20, 1, 0); |

2499 |

2500 // Result of the multiplication is shifted by one bit position |

2501 // [X3:X2:X1:X0] = [X3:X2:X1:X0] << 1 |

// NOTE(review): ushr immediates are written as (-n & 63); judging by the
// ">> 1", ">> 2" and ">> 7" steps further down this appears to encode a right
// shift by n, so the two uses here would be shifts by 63 that extract the bit
// carried into the next 64-bit lane - confirm against the Assembler's ushr.
2502 __ ushr(v18, __ T2D, v5, -63 & 63); |

2503 __ ins(v25, __ D, v18, 1, 0); |

2504 __ ins(v25, __ D, vzr, 0, 0); |

2505 __ ushl(v5, __ T2D, v5, v26); |

2506 __ orr(v5, __ T16B, v5, v25); |

2507 |

// Same one-bit shift for the upper half <X3:X2>; the shifted result goes to v6.
2508 __ ushr(v19, __ T2D, v7, -63 & 63); |

2509 __ ins(v19, __ D, v19, 1, 0); |

2510 __ ins(v19, __ D, v18, 0, 1); |

2511 __ ushl(v7, __ T2D, v7, v26); |

2512 __ orr(v6, __ T16B, v7, v19); |

2513 |

// Reduction of the lower half <X1:X0> (held in v5) starts here.
2514 __ ins(v24, __ D, v5, 0, 1); |

2515 |

2516 // A = X0 << 63 |

2517 __ ushl(v21, __ T2D, v5, v27); |

2518 |

2519 // B = X0 << 62 |

2520 __ ushl(v22, __ T2D, v5, v28); |

2521 |

2522 // C = X0 << 57 |

2523 __ ushl(v23, __ T2D, v5, v29); |

2524 |

2525 // D = X1^A^B^C |

2526 __ eor(v21, __ T16B, v21, v22); |

2527 __ eor(v21, __ T16B, v21, v23); |

2528 __ eor(v21, __ T16B, v21, v24); |

2529 __ ins(v5, __ D, v21, 1, 0); |

2530 |

2531 // [E1:E0] = [D:X0] >> 1 |

2532 __ ushr(v20, __ T2D, v5, -1 & 63); |

2533 __ ushl(v18, __ T2D, v5, v27); |

2534 __ ext(v25, __ T16B, v18, vzr, 0x08); |

2535 __ orr(v19, __ T16B, v20, v25); |

2536 |

// v7 accumulates [D:X0] ^ [E1:E0] ^ [F1:F0] ^ [G1:G0] over the next steps.
2537 __ eor(v7, __ T16B, v5, v19); |

2538 |

2539 // [F1:F0] = [D:X0] >> 2 |

2540 __ ushr(v20, __ T2D, v5, -2 & 63); |

2541 __ ushl(v18, __ T2D, v5, v28); |

2542 __ ins(v25, __ D, v18, 0, 1); |

2543 __ orr(v19, __ T16B, v20, v25); |

2544 |

2545 __ eor(v7, __ T16B, v7, v19); |

2546 |

2547 // [G1:G0] = [D:X0] >> 7 |

2548 __ ushr(v20, __ T2D, v5, -7 & 63); |

2549 __ ushl(v18, __ T2D, v5, v29); |

2550 __ ins(v25, __ D, v18, 0, 1); |

2551 __ orr(v19, __ T16B, v20, v25); |

2552 |

2553 // [H1:H0] = [D^E1^F1^G1:X0^E0^F0^G0] |

2554 __ eor(v7, __ T16B, v7, v19); |

2555 |

2556 // Result = [H1:H0]^[X3:X2] |

2557 __ eor(v0, __ T16B, v7, v6); |

2558 |

// One block consumed; loop while the remaining count is non-zero.
// NOTE(review): cbnz tests the register itself, so the flags set by subs do not
// appear to be read anywhere in this stub; a plain sub would seem to suffice.
2559 __ subs(blocks, blocks, 1); |

2560 __ cbnz(blocks, L_ghash_loop); |

2561 |

// Swap the 64-bit halves back (mirroring the swap done after the initial load)
// and write the updated state to the state address.
2562 __ ext(v1, __ T16B, v0, v0, 0x08); |

2563 __ st1(v1, __ T16B, state); |

2564 __ ret(lr); |

2565 |

2566 return start; |

2567 } |
|
2568 |
2438 // Continuation point for throwing of implicit exceptions that are |
2569 // Continuation point for throwing of implicit exceptions that are |
2439 // not handled in the current activation. Fabricates an exception |
2570 // not handled in the current activation. Fabricates an exception |
2440 // oop and initiates normal exception dispatching in this |
2571 // oop and initiates normal exception dispatching in this |
2441 // frame. Since we need to preserve callee-saved values (currently |
2572 // frame. Since we need to preserve callee-saved values (currently |
2442 // only for C2, but done for C1 as well) we need a callee-saved oop |
2573 // only for C2, but done for C1 as well) we need a callee-saved oop |