>>> hash(\"\\x01\")
128000384
>>> hash(\"\\x02\")
256000771
>>> hash(\"\\x03\")
384001154
>>> hash(\"\\x04\")
512001541
I don't think the accepted answer is really representative of cPython's internal hash implementations, which can be found in pyhash.c:
Description of algorithm for numeric types:
For numeric types, the hash of a number x is based on the reduction
of x modulo the prime P = 2**_PyHASH_BITS - 1. It's designed so that
hash(x) == hash(y) whenever x and y are numerically equal, even if
x and y have different types.
A quick summary of the hashing strategy:
(1) First define the 'reduction of x modulo P' for any rational
number x; this is a standard extension of the usual notion of
reduction modulo P for integers. If x == p/q (written in lowest
terms), the reduction is interpreted as the reduction of p times
the inverse of the reduction of q, all modulo P; if q is exactly
divisible by P then define the reduction to be infinity. So we've
got a well-defined map
reduce : { rational numbers } -> { 0, 1, 2, ..., P-1, infinity }.
(2) Now for a rational number x, define hash(x) by:
reduce(x) if x >= 0
-reduce(-x) if x < 0
If the result of the reduction is infinity (this is impossible for
integers, floats and Decimals) then use the predefined hash value
_PyHASH_INF for x >= 0, or -_PyHASH_INF for x < 0, instead.
_PyHASH_INF, -_PyHASH_INF and _PyHASH_NAN are also used for the
hashes of float and Decimal infinities and nans.
A selling point for the above strategy is that it makes it possible
to compute hashes of decimal and binary floating-point numbers
efficiently, even if the exponent of the binary or decimal number
is large. The key point is that
reduce(x * y) == reduce(x) * reduce(y) (modulo _PyHASH_MODULUS)
provided that {reduce(x), reduce(y)} != {0, infinity}. The reduction of a
binary or decimal float is never infinity, since the denominator is a power
of 2 (for binary) or a divisor of a power of 10 (for decimal). So we have,
for nonnegative x,
reduce(x * 2**e) == reduce(x) * reduce(2**e) % _PyHASH_MODULUS
reduce(x * 10**e) == reduce(x) * reduce(10**e) % _PyHASH_MODULUS
and reduce(10**e) can be computed efficiently by the usual modular
exponentiation algorithm. For reduce(2**e) it's even better: since
P is of the form 2**n-1, reduce(2**e) is 2**(e mod n), and multiplication
by 2**(e mod n) modulo 2**n-1 just amounts to a rotation of bits.
Hashing for doubles:
Py_hash_t
_Py_HashDouble(double v)
{
int e, sign;
double m;
Py_uhash_t x, y;
if (!Py_IS_FINITE(v)) {
if (Py_IS_INFINITY(v))
return v > 0 ? _PyHASH_INF : -_PyHASH_INF;
else
return _PyHASH_NAN;
}
m = frexp(v, &e);
sign = 1;
if (m < 0) {
sign = -1;
m = -m;
}
/* process 28 bits at a time; this should work well both for binary
and hexadecimal floating point. */
x = 0;
while (m) {
x = ((x << 28) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - 28);
m *= 268435456.0; /* 2**28 */
e -= 28;
y = (Py_uhash_t)m; /* pull out integer part */
m -= y;
x += y;
if (x >= _PyHASH_MODULUS)
x -= _PyHASH_MODULUS;
}
/* adjust for the exponent; first reduce it modulo _PyHASH_BITS */
e = e >= 0 ? e % _PyHASH_BITS : _PyHASH_BITS-1-((-1-e) % _PyHASH_BITS);
x = ((x << e) & _PyHASH_MODULUS) | x >> (_PyHASH_BITS - e);
x = x * sign;
if (x == (Py_uhash_t)-1)
x = (Py_uhash_t)-2;
return (Py_hash_t)x;
}
Hashing of tuples:
static Py_hash_t
tuplehash(PyTupleObject *v)
{
Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Py_hash_t y;
Py_ssize_t len = Py_SIZE(v);
PyObject **p;
Py_uhash_t mult = _PyHASH_MULTIPLIER;
x = 0x345678UL;
p = v->ob_item;
while (--len >= 0) {
y = PyObject_Hash(*p++);
if (y == -1)
return -1;
x = (x ^ y) * mult;
/* the cast might truncate len; that doesn't change hash stability */
mult += (Py_hash_t)(82520UL + len + len);
}
x += 97531UL;
if (x == (Py_uhash_t)-1)
x = -2;
return x;
}
The file also implements modified FNV hashing:
#if Py_HASH_ALGORITHM == Py_HASH_FNV
/* **************************************************************************
* Modified Fowler-Noll-Vo (FNV) hash function
*/
static Py_hash_t
fnv(const void *src, Py_ssize_t len)
{
const unsigned char *p = src;
Py_uhash_t x;
Py_ssize_t remainder, blocks;
union {
Py_uhash_t value;
unsigned char bytes[SIZEOF_PY_UHASH_T];
} block;
#ifdef Py_DEBUG
assert(_Py_HashSecret_Initialized);
#endif
remainder = len % SIZEOF_PY_UHASH_T;
if (remainder == 0) {
/* Process at least one block byte by byte to reduce hash collisions
* for strings with common prefixes. */
remainder = SIZEOF_PY_UHASH_T;
}
blocks = (len - remainder) / SIZEOF_PY_UHASH_T;
x = (Py_uhash_t) _Py_HashSecret.fnv.prefix;
x ^= (Py_uhash_t) *p << 7;
while (blocks--) {
PY_UHASH_CPY(block.bytes, p);
x = (_PyHASH_MULTIPLIER * x) ^ block.value;
p += SIZEOF_PY_UHASH_T;
}
/* add remainder */
for (; remainder > 0; remainder--)
x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *p++;
x ^= (Py_uhash_t) len;
x ^= (Py_uhash_t) _Py_HashSecret.fnv.suffix;
if (x == -1) {
x = -2;
}
return x;
}
static PyHash_FuncDef PyHash_Func = {fnv, "fnv", 8 * SIZEOF_PY_HASH_T,
16 * SIZEOF_PY_HASH_T};
#endif /* Py_HASH_ALGORITHM == Py_HASH_FNV */
According to PEP 456, SipHash (MIT License) is the default string and bytes hash algorithm:
/* byte swap little endian to host endian
* Endian conversion not only ensures that the hash function returns the same
* value on all platforms. It is also required to for a good dispersion of
* the hash values' least significant bits.
*/
#if PY_LITTLE_ENDIAN
# define _le64toh(x) ((uint64_t)(x))
#elif defined(__APPLE__)
# define _le64toh(x) OSSwapLittleToHostInt64(x)
#elif defined(HAVE_LETOH64)
# define _le64toh(x) le64toh(x)
#else
# define _le64toh(x) (((uint64_t)(x) << 56) | \
(((uint64_t)(x) << 40) & 0xff000000000000ULL) | \
(((uint64_t)(x) << 24) & 0xff0000000000ULL) | \
(((uint64_t)(x) << 8) & 0xff00000000ULL) | \
(((uint64_t)(x) >> 8) & 0xff000000ULL) | \
(((uint64_t)(x) >> 24) & 0xff0000ULL) | \
(((uint64_t)(x) >> 40) & 0xff00ULL) | \
((uint64_t)(x) >> 56))
#endif
#ifdef _MSC_VER
# define ROTATE(x, b) _rotl64(x, b)
#else
# define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) )
#endif
#define HALF_ROUND(a,b,c,d,s,t) \
a += b; c += d; \
b = ROTATE(b, s) ^ a; \
d = ROTATE(d, t) ^ c; \
a = ROTATE(a, 32);
#define DOUBLE_ROUND(v0,v1,v2,v3) \
HALF_ROUND(v0,v1,v2,v3,13,16); \
HALF_ROUND(v2,v1,v0,v3,17,21); \
HALF_ROUND(v0,v1,v2,v3,13,16); \
HALF_ROUND(v2,v1,v0,v3,17,21);
static uint64_t
siphash24(uint64_t k0, uint64_t k1, const void *src, Py_ssize_t src_sz) {
uint64_t b = (uint64_t)src_sz << 56;
const uint64_t *in = (uint64_t*)src;
uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
uint64_t v3 = k1 ^ 0x7465646279746573ULL;
uint64_t t;
uint8_t *pt;
uint8_t *m;
while (src_sz >= 8) {
uint64_t mi = _le64toh(*in);
in += 1;
src_sz -= 8;
v3 ^= mi;
DOUBLE_ROUND(v0,v1,v2,v3);
v0 ^= mi;
}
t = 0;
pt = (uint8_t *)&t;
m = (uint8_t *)in;
switch (src_sz) {
case 7: pt[6] = m[6]; /* fall through */
case 6: pt[5] = m[5]; /* fall through */
case 5: pt[4] = m[4]; /* fall through */
case 4: memcpy(pt, m, sizeof(uint32_t)); break;
case 3: pt[2] = m[2]; /* fall through */
case 2: pt[1] = m[1]; /* fall through */
case 1: pt[0] = m[0]; /* fall through */
}
b |= _le64toh(t);
v3 ^= b;
DOUBLE_ROUND(v0,v1,v2,v3);
v0 ^= b;
v2 ^= 0xff;
DOUBLE_ROUND(v0,v1,v2,v3);
DOUBLE_ROUND(v0,v1,v2,v3);
/* modified */
t = (v0 ^ v1) ^ (v2 ^ v3);
return t;
}
static Py_hash_t
pysiphash(const void *src, Py_ssize_t src_sz) {
return (Py_hash_t)siphash24(
_le64toh(_Py_HashSecret.siphash.k0), _le64toh(_Py_HashSecret.siphash.k1),
src, src_sz);
}
uint64_t
_Py_KeyedHash(uint64_t key, const void *src, Py_ssize_t src_sz)
{
return siphash24(key, 0, src, src_sz);
}
#if Py_HASH_ALGORITHM == Py_HASH_SIPHASH24
static PyHash_FuncDef PyHash_Func = {pysiphash, "siphash24", 64, 128};
#endif
If you download the source code of Python, you will find it for sure! But bear in mind the hash function is implemented for each kind of objects differently.
For example, you will find the unicode hash function in Objects/unicodeobject.c
in the function unicode_hash
. You might have to look a bit more to find the string hash function. Find the structure defining the object you are interested in, and in the field tp_hash
, you will find the function that compute the hash code of that object.
For the string object: The exact code is found in Objects/stringobject.c
in the function string_hash
:
static long string_hash(PyStringObject *a)
{
register Py_ssize_t len;
register unsigned char *p;
register long x;
if (a->ob_shash != -1)
return a->ob_shash;
len = Py_SIZE(a);
p = (unsigned char *) a->ob_sval;
x = *p << 7;
while (--len >= 0)
x = (1000003*x) ^ *p++;
x ^= Py_SIZE(a);
if (x == -1)
x = -2;
a->ob_shash = x;
return x;
}