    __ vxorpd($dst$$XMMRegister, $dst$$XMMRegister, $dst$$XMMRegister, vector256);
  %}
  ins_pipe( fpu_reg_reg );
%}

// ====================REDUCTION ARITHMETIC=======================================

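// The rules below match the AddReductionV{I,F,D} and MulReductionV{I,F,D} ideal
// nodes generated for reduction loops: dst receives the scalar src1 combined
// with every lane of the vector src2.  As an illustrative sketch (not taken
// from this file), a Java loop such as
//
//   int sum = 0;
//   for (int i = 0; i < a.length; i++) {
//     sum += a[i];
//   }
//
// is auto-vectorized so that, roughly, each vector-wide iteration folds a
// vector of elements from a[] and the running scalar sum into one
// AddReductionVI node.  The integer add rules below implement this with
// horizontal adds (phaddd/vphaddd).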
instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 2 && UseAVX == 0);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp2, TEMP tmp);
  format %{ "movdqu $tmp2,$src2\n\t"
            "phaddd $tmp2,$tmp2\n\t"
            "movd $tmp,$src1\n\t"
            "paddd $tmp,$tmp2\n\t"
            "movd $dst,$tmp\t! add reduction2I" %}
  ins_encode %{
    __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdl($dst$$Register, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vphaddd $tmp,$src2,$src2\n\t"
            "movd $tmp2,$src1\n\t"
            "vpaddd $tmp2,$tmp2,$tmp\n\t"
            "movd $dst,$tmp2\t! add reduction2I" %}
  ins_encode %{
    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 2 && UseAVX == 0);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp2, TEMP tmp);
  format %{ "movdqu $tmp2,$src2\n\t"
            "phaddd $tmp2,$tmp2\n\t"
            "phaddd $tmp2,$tmp2\n\t"
            "movd $tmp,$src1\n\t"
            "paddd $tmp,$tmp2\n\t"
            "movd $dst,$tmp\t! add reduction4I" %}
  ins_encode %{
    __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
    __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdl($dst$$Register, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vphaddd $tmp,$src2,$src2\n\t"
            "vphaddd $tmp,$tmp,$tmp2\n\t"
            "movd $tmp2,$src1\n\t"
            "vpaddd $tmp2,$tmp2,$tmp\n\t"
            "movd $dst,$tmp2\t! add reduction4I" %}
  ins_encode %{
    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false);
    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

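// For a 256-bit vector, vphaddd adds within each 128-bit lane independently,
// so the upper lane is pulled down with vextractf128 and folded into the lower
// lane with a 128-bit vpaddd before the scalar src1 is added in.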
instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vphaddd $tmp,$src2,$src2\n\t"
            "vphaddd $tmp,$tmp,$tmp2\n\t"
            "vextractf128 $tmp2,$tmp\n\t"
            "vpaddd $tmp,$tmp,$tmp2\n\t"
            "movd $tmp2,$src1\n\t"
            "vpaddd $tmp2,$tmp2,$tmp\n\t"
            "movd $dst,$tmp2\t! add reduction8I" %}
  ins_encode %{
    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, true);
    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, true);
    __ vextractf128h($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

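// Float add reductions.  Floating point addition is not associative, so the
// lanes are accumulated in order -- src1 + src2[0] + src2[1] + ... -- which
// preserves the evaluation order of the scalar loop.  pshufd brings each
// successive lane into element 0 for the scalar addss/vaddss, and
// vextractf128 folds in the upper 128-bit lane of a 256-bit vector.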
instruct rsadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (AddReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "movdqu $tmp,$src1\n\t"
            "addss $tmp,$src2\n\t"
            "pshufd $tmp2,$src2,0x01\n\t"
            "addss $tmp,$tmp2\n\t"
            "movdqu $dst,$tmp\t! add reduction2F" %}
  ins_encode %{
    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
    __ addss($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVF src1 src2));
  effect(TEMP tmp2, TEMP tmp);
  format %{ "vaddss $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0x01\n\t"
            "vaddss $dst,$tmp2,$tmp\t! add reduction2F" %}
  ins_encode %{
    __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (AddReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "movdqu $tmp,$src1\n\t"
            "addss $tmp,$src2\n\t"
            "pshufd $tmp2,$src2,0x01\n\t"
            "addss $tmp,$tmp2\n\t"
            "pshufd $tmp2,$src2,0x02\n\t"
            "addss $tmp,$tmp2\n\t"
            "pshufd $tmp2,$src2,0x03\n\t"
            "addss $tmp,$tmp2\n\t"
            "movdqu $dst,$tmp\t! add reduction4F" %}
  ins_encode %{
    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
    __ addss($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vaddss $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0x01\n\t"
            "vaddss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x02\n\t"
            "vaddss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x03\n\t"
            "vaddss $dst,$tmp2,$tmp\t! add reduction4F" %}
  ins_encode %{
    __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct radd8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vaddss $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0x01\n\t"
            "vaddss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x02\n\t"
            "vaddss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x03\n\t"
            "vaddss $tmp2,$tmp2,$tmp\n\t"
            "vextractf128 $tmp3,$src2\n\t"
            "vaddss $tmp2,$tmp2,$tmp3\n\t"
            "pshufd $tmp,$tmp3,0x01\n\t"
            "vaddss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$tmp3,0x02\n\t"
            "vaddss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$tmp3,0x03\n\t"
            "vaddss $dst,$tmp2,$tmp\t! add reduction8F" %}
  ins_encode %{
    __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
    __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

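// Double add reductions follow the same order-preserving scheme with
// addsd/vaddsd; pshufd with immediate 0xE moves the upper 64-bit lane of a
// 128-bit vector into element 0.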
instruct rsadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (AddReductionVD src1 src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "movdqu $tmp,$src1\n\t"
            "addsd $tmp,$src2\n\t"
            "pshufd $dst,$src2,0xE\n\t"
            "addsd $dst,$tmp\t! add reduction2D" %}
  ins_encode %{
    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
    __ addsd($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVD src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vaddsd $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0xE\n\t"
            "vaddsd $dst,$tmp2,$tmp\t! add reduction2D" %}
  ins_encode %{
    __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvadd4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVD src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vaddsd $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0xE\n\t"
            "vaddsd $tmp2,$tmp2,$tmp\n\t"
            "vextractf128 $tmp3,$src2\n\t"
            "vaddsd $tmp2,$tmp2,$tmp3\n\t"
            "pshufd $tmp,$tmp3,0xE\n\t"
            "vaddsd $dst,$tmp2,$tmp\t! add reduction4D" %}
  ins_encode %{
    __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
    __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
    __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

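// Integer multiply reductions.  Packed 32-bit multiply (pmulld/vpmulld) is an
// SSE4.1 instruction, hence the UseSSE > 3 predicate on the non-AVX rules.
// The vector is repeatedly folded in half with pshufd and a packed multiply
// until element 0 holds the product of all lanes, which is then multiplied
// with the scalar src1.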
instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 3 && UseAVX == 0);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd $tmp2,$src2,0x1\n\t"
            "pmulld $tmp2,$src2\n\t"
            "movd $tmp,$src1\n\t"
            "pmulld $tmp2,$tmp\n\t"
            "movd $dst,$tmp2\t! mul reduction2I" %}
  ins_encode %{
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd $tmp2,$src2,0x1\n\t"
            "vpmulld $tmp,$src2,$tmp2\n\t"
            "movd $tmp2,$src1\n\t"
            "vpmulld $tmp2,$tmp,$tmp2\n\t"
            "movd $dst,$tmp2\t! mul reduction2I" %}
  ins_encode %{
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
    __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 3 && UseAVX == 0);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd $tmp2,$src2,0xE\n\t"
            "pmulld $tmp2,$src2\n\t"
            "pshufd $tmp,$tmp2,0x1\n\t"
            "pmulld $tmp2,$tmp\n\t"
            "movd $tmp,$src1\n\t"
            "pmulld $tmp2,$tmp\n\t"
            "movd $dst,$tmp2\t! mul reduction4I" %}
  ins_encode %{
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($tmp$$XMMRegister, $src1$$Register);
    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd $tmp2,$src2,0xE\n\t"
            "vpmulld $tmp,$src2,$tmp2\n\t"
            "pshufd $tmp2,$tmp,0x1\n\t"
            "vpmulld $tmp,$tmp,$tmp2\n\t"
            "movd $tmp2,$src1\n\t"
            "vpmulld $tmp2,$tmp,$tmp2\n\t"
            "movd $dst,$tmp2\t! mul reduction4I" %}
  ins_encode %{
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextractf128 $tmp,$src2\n\t"
            "vpmulld $tmp,$tmp,$src2\n\t"
            "pshufd $tmp2,$tmp,0xE\n\t"
            "vpmulld $tmp,$tmp,$tmp2\n\t"
            "pshufd $tmp2,$tmp,0x1\n\t"
            "vpmulld $tmp,$tmp,$tmp2\n\t"
            "movd $tmp2,$src1\n\t"
            "vpmulld $tmp2,$tmp,$tmp2\n\t"
            "movd $dst,$tmp2\t! mul reduction8I" %}
  ins_encode %{
    __ vextractf128h($tmp$$XMMRegister, $src2$$XMMRegister);
    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, false);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
    __ movdl($tmp2$$XMMRegister, $src1$$Register);
    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
    __ movdl($dst$$Register, $tmp2$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

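// Float multiply reductions use the same in-order scalar chain as the float
// add reductions above, with mulss/vmulss in place of addss/vaddss.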
instruct rsmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (MulReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "movdqu $tmp,$src1\n\t"
            "mulss $tmp,$src2\n\t"
            "pshufd $tmp2,$src2,0x01\n\t"
            "mulss $tmp,$tmp2\n\t"
            "movdqu $dst,$tmp\t! mul reduction2F" %}
  ins_encode %{
    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
    __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vmulss $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0x01\n\t"
            "vmulss $dst,$tmp2,$tmp\t! mul reduction2F" %}
  ins_encode %{
    __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rsmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (MulReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "movdqu $tmp,$src1\n\t"
            "mulss $tmp,$src2\n\t"
            "pshufd $tmp2,$src2,0x01\n\t"
            "mulss $tmp,$tmp2\n\t"
            "pshufd $tmp2,$src2,0x02\n\t"
            "mulss $tmp,$tmp2\n\t"
            "pshufd $tmp2,$src2,0x03\n\t"
            "mulss $tmp,$tmp2\n\t"
            "movdqu $dst,$tmp\t! mul reduction4F" %}
  ins_encode %{
    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
    __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vmulss $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0x01\n\t"
            "vmulss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x02\n\t"
            "vmulss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x03\n\t"
            "vmulss $dst,$tmp2,$tmp\t! mul reduction4F" %}
  ins_encode %{
    __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmul8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vmulss $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0x01\n\t"
            "vmulss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x02\n\t"
            "vmulss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x03\n\t"
            "vmulss $tmp2,$tmp2,$tmp\n\t"
            "vextractf128 $tmp3,$src2\n\t"
            "vmulss $tmp2,$tmp2,$tmp3\n\t"
            "pshufd $tmp,$tmp3,0x01\n\t"
            "vmulss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$tmp3,0x02\n\t"
            "vmulss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$tmp3,0x03\n\t"
            "vmulss $dst,$tmp2,$tmp\t! mul reduction8F" %}
  ins_encode %{
    __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
    __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

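// Double multiply reductions, analogous to the double add reductions above
// (in-order mulsd/vmulsd chain; pshufd 0xE selects the upper 64-bit lane).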
instruct rsmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (MulReductionVD src1 src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "movdqu $tmp,$src1\n\t"
            "mulsd $tmp,$src2\n\t"
            "pshufd $dst,$src2,0xE\n\t"
            "mulsd $dst,$tmp\t! mul reduction2D" %}
  ins_encode %{
    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
    __ mulsd($tmp$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVD src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vmulsd $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0xE\n\t"
            "vmulsd $dst,$tmp2,$tmp\t! mul reduction2D" %}
  ins_encode %{
    __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

instruct rvmul4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVD src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vmulsd $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0xE\n\t"
            "vmulsd $tmp2,$tmp2,$tmp\n\t"
            "vextractf128 $tmp3,$src2\n\t"
            "vmulsd $tmp2,$tmp2,$tmp3\n\t"
            "pshufd $tmp,$tmp3,0xE\n\t"
            "vmulsd $dst,$tmp2,$tmp\t! mul reduction4D" %}
  ins_encode %{
    __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
    __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
    __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
    __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}

// ====================VECTOR ARITHMETIC=======================================

// --------------------------------- ADD --------------------------------------

// Bytes vector add