hotspot/src/cpu/x86/vm/stubRoutines_x86.cpp
author goetz
Mon, 26 Oct 2015 16:21:37 +0100
changeset 33794 41ef3dc95179
parent 33066 d98eab8215c4
child 35154 a9b3c1984a01
permissions -rw-r--r--
8140482: Various minor code improvements (runtime) Reviewed-by: dholmes, coleenp, sspitsyn, dsamersoff

/*
 * Copyright (c) 2013, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "runtime/deoptimization.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "crc32c.h"

// Implementation of the platform-specific part of StubRoutines - for
// a description of how to extend it, see the stubRoutines.hpp file.

address StubRoutines::x86::_verify_mxcsr_entry = NULL;
address StubRoutines::x86::_key_shuffle_mask_addr = NULL;
address StubRoutines::x86::_ghash_long_swap_mask_addr = NULL;
address StubRoutines::x86::_ghash_byte_swap_mask_addr = NULL;

uint64_t StubRoutines::x86::_crc_by128_masks[] =
{
  /* The fields in this structure are arranged so that they can be
   * picked up two at a time with 128-bit loads.
   *
   * Because of flipped bit order for this CRC polynomials
   * the constant for X**N is left-shifted by 1.  This is because
   * a 64 x 64 polynomial multiply produces a 127-bit result
   * but the highest term is always aligned to bit 0 in the container.
   * Pre-shifting by one fixes this, at the cost of potentially making
   * the 32-bit constant no longer fit in a 32-bit container (thus the
   * use of uint64_t, though this is also the size used by the carry-
   * less multiply instruction.
   *
   * In addition, the flipped bit order and highest-term-at-least-bit
   * multiply changes the constants used.  The 96-bit result will be
   * aligned to the high-term end of the target 128-bit container,
   * not the low-term end; that is, instead of a 512-bit or 576-bit fold,
   * instead it is a 480 (=512-32) or 544 (=512+64-32) bit fold.
   *
   * This cause additional problems in the 128-to-64-bit reduction; see the
   * code for details.  By storing a mask in the otherwise unused half of
   * a 128-bit constant, bits can be cleared before multiplication without
   * storing and reloading.  Note that staying on a 128-bit datapath means
   * that some data is uselessly stored and some unused data is intersected
   * with an irrelevant constant.
   */

  ((uint64_t) 0xffffffffUL),     /* low  of K_M_64    */
  ((uint64_t) 0xb1e6b092U << 1), /* high of K_M_64    */
  ((uint64_t) 0xba8ccbe8U << 1), /* low  of K_160_96  */
  ((uint64_t) 0x6655004fU << 1), /* high of K_160_96  */
  ((uint64_t) 0xaa2215eaU << 1), /* low  of K_544_480 */
  ((uint64_t) 0xe3720acbU << 1)  /* high of K_544_480 */
};

/**
 *  crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h
 */
juint StubRoutines::x86::_crc_table[] =
{
    0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
    0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
    0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
    0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
    0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
    0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
    0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
    0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
    0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
    0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
    0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
    0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
    0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
    0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
    0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
    0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
    0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
    0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
    0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
    0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
    0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
    0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
    0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
    0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
    0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
    0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
    0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
    0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
    0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
    0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
    0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
    0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
    0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
    0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
    0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
    0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
    0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
    0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
    0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
    0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
    0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
    0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
    0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
    0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
    0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
    0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
    0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
    0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
    0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
    0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
    0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
    0x2d02ef8dUL
};

#define D 32
#define P 0x82F63B78 // Reflection of Castagnoli (0x11EDC6F41)

#define TILL_CYCLE 31
uint32_t _crc32c_pow_2k_table[TILL_CYCLE]; // because _crc32c_pow_2k_table[TILL_CYCLE == 31] == _crc32c_pow_2k_table[0]

// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 8
// Listing 1: Multiplication of normalized polynomials
// "a" and "b" occupy D least significant bits.
uint32_t crc32c_multiply(uint32_t a, uint32_t b) {
  uint32_t product = 0;
  uint32_t b_pow_x_table[D + 1]; // b_pow_x_table[k] = (b * x**k) mod P
  b_pow_x_table[0] = b;
  for (int k = 0; k < D; ++k) {
    // If "a" has non-zero coefficient at x**k,/ add ((b * x**k) mod P) to the result.
    if ((a & (((uint32_t)1) << (D - 1 - k))) != 0) product ^= b_pow_x_table[k];

    // Compute b_pow_x_table[k+1] = (b ** x**(k+1)) mod P.
    if (b_pow_x_table[k] & 1) {
      // If degree of (b_pow_x_table[k] * x) is D, then
      // degree of (b_pow_x_table[k] * x - P) is less than D.
      b_pow_x_table[k + 1] = (b_pow_x_table[k] >> 1) ^ P;
    }
    else {
      b_pow_x_table[k + 1] = b_pow_x_table[k] >> 1;
    }
  }
  return product;
}
#undef D
#undef P

// A. Kadatch and B. Jenkins / Everything we know about CRC but afraid to forget September 3, 2010 9
void crc32c_init_pow_2k(void) {
  // _crc32c_pow_2k_table(0) =
  // x^(2^k) mod P(x) = x mod P(x) = x
  // Since we are operating on a reflected values
  // x = 10b, reflect(x) = 0x40000000
  _crc32c_pow_2k_table[0] = 0x40000000;

  for (int k = 1; k < TILL_CYCLE; k++) {
    // _crc32c_pow_2k_table(k+1) = _crc32c_pow_2k_table(k-1)^2 mod P(x)
    uint32_t tmp = _crc32c_pow_2k_table[k - 1];
    _crc32c_pow_2k_table[k] = crc32c_multiply(tmp, tmp);
  }
}

// x^N mod P(x)
uint32_t crc32c_f_pow_n(uint32_t n) {
  //            result = 1 (polynomial)
  uint32_t one, result = 0x80000000, i = 0;

  while (one = (n & 1), (n == 1 || n - one > 0)) {
    if (one) {
      result = crc32c_multiply(result, _crc32c_pow_2k_table[i]);
    }
    n >>= 1;
    i++;
  }

  return result;
}

juint *StubRoutines::x86::_crc32c_table;

void StubRoutines::x86::generate_CRC32C_table(bool is_pclmulqdq_table_supported) {

  static juint pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS];

  crc32c_init_pow_2k();

  pow_n[0] = crc32c_f_pow_n(CRC32C_HIGH * 8);      // 8N * 8 = 64N
  pow_n[1] = crc32c_f_pow_n(CRC32C_HIGH * 8 * 2);  // 128N

  pow_n[2] = crc32c_f_pow_n(CRC32C_MIDDLE * 8);
  pow_n[3] = crc32c_f_pow_n(CRC32C_MIDDLE * 8 * 2);

  pow_n[4] = crc32c_f_pow_n(CRC32C_LOW * 8);
  pow_n[CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1] =
            crc32c_f_pow_n(CRC32C_LOW * 8 * 2);

  if (is_pclmulqdq_table_supported) {
    _crc32c_table = pow_n;
  } else {
    static julong pclmulqdq_table[CRC32C_NUM_PRECOMPUTED_CONSTANTS * 256];

    for (int j = 0; j < CRC32C_NUM_PRECOMPUTED_CONSTANTS; j++) {
      static juint X_CONST = pow_n[j];
      for (int64_t i = 0; i < 256; i++) { // to force 64 bit wide computations
      // S. Gueron / Information Processing Letters 112 (2012) 184
      // Algorithm 3: Generating a carry-less multiplication lookup table.
      // Input: A 32-bit constant, X_CONST.
      // Output: A table of 256 entries, each one is a 64-bit quadword,
      // that can be used for computing "byte" * X_CONST, for a given byte.
        pclmulqdq_table[j * 256 + i] =
          ((i & 1) * X_CONST) ^ ((i & 2) * X_CONST) ^ ((i & 4) * X_CONST) ^
          ((i & 8) * X_CONST) ^ ((i & 16) * X_CONST) ^ ((i & 32) * X_CONST) ^
          ((i & 64) * X_CONST) ^ ((i & 128) * X_CONST);
      }
    }
    _crc32c_table = (juint*)pclmulqdq_table;
  }
}