linux-x32-abi

How to detect X32 ABI or environment in the preprocessor?

老子叫甜甜 — Sun, 22 Dec 2019 18:58:21 +0000

How to detect X32 ABI or environment in the preprocessor? 老子叫甜甜 2019-12-23 02:58:21

问题

X32 is an ABI for amd64/x86_64 CPUs using 32-bit pointers. The idea is to combine the larger register set of x86_64 with the smaller memory and cache footprint resulting from 32-bit pointers. It provides up to about a 40% speedup. See Difference between x86, x32, and x64 architectures on Stack Overflow, and the Debian X32 Ports wiki page for details and setting it up as a chroot environment.

We have a bug report from a Debian maintainer under the environment. The report is adcq is an illegal instruction. The inline assembly is activated based on preprocessor macros, so we are not detecting X32 properly (or more correctly, not at all until now).

The most obvious choice (to me) for a preprocessor macro is something like __X32__, but that's not offered. Based on Clang's patch and Debian's suggestion, it looks like __ILP32__ can be used. But I'd like a more canonical answer since _ILP32 and __code_model_small__ look interesting, too. (And I'm aware of issues with SSE2, where the compiler supported it but the OS did not).

What are the preprocessor macros that can be used to reliably detect an X32 ABI and environment when using Clang and GCC?

To be clear, I'm not trying to fix the code at this point. I just want to know the macros that can be used in a complete remediation.

# cpp -dM < /dev/null | sort
#define __amd64 1
#define __amd64__ 1
#define __ATOMIC_ACQ_REL 4
#define __ATOMIC_ACQUIRE 2
#define __ATOMIC_CONSUME 1
#define __ATOMIC_HLE_ACQUIRE 65536
#define __ATOMIC_HLE_RELEASE 131072
#define __ATOMIC_RELAXED 0
#define __ATOMIC_RELEASE 3
#define __ATOMIC_SEQ_CST 5
#define __BIGGEST_ALIGNMENT__ 16
#define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__
#define __CHAR16_TYPE__ short unsigned int
#define __CHAR32_TYPE__ unsigned int
#define __CHAR_BIT__ 8
#define __code_model_small__ 1
#define __DBL_DECIMAL_DIG__ 17
#define __DBL_DENORM_MIN__ ((double)4.94065645841246544177e-324L)
#define __DBL_DIG__ 15
#define __DBL_EPSILON__ ((double)2.22044604925031308085e-16L)
#define __DBL_HAS_DENORM__ 1
#define __DBL_HAS_INFINITY__ 1
#define __DBL_HAS_QUIET_NAN__ 1
#define __DBL_MANT_DIG__ 53
#define __DBL_MAX_10_EXP__ 308
#define __DBL_MAX__ ((double)1.79769313486231570815e+308L)
#define __DBL_MAX_EXP__ 1024
#define __DBL_MIN_10_EXP__ (-307)
#define __DBL_MIN__ ((double)2.22507385850720138309e-308L)
#define __DBL_MIN_EXP__ (-1021)
#define __DEC128_EPSILON__ 1E-33DL
#define __DEC128_MANT_DIG__ 34
#define __DEC128_MAX__ 9.999999999999999999999999999999999E6144DL
#define __DEC128_MAX_EXP__ 6145
#define __DEC128_MIN__ 1E-6143DL
#define __DEC128_MIN_EXP__ (-6142)
#define __DEC128_SUBNORMAL_MIN__ 0.000000000000000000000000000000001E-6143DL
#define __DEC32_EPSILON__ 1E-6DF
#define __DEC32_MANT_DIG__ 7
#define __DEC32_MAX__ 9.999999E96DF
#define __DEC32_MAX_EXP__ 97
#define __DEC32_MIN__ 1E-95DF
#define __DEC32_MIN_EXP__ (-94)
#define __DEC32_SUBNORMAL_MIN__ 0.000001E-95DF
#define __DEC64_EPSILON__ 1E-15DD
#define __DEC64_MANT_DIG__ 16
#define __DEC64_MAX__ 9.999999999999999E384DD
#define __DEC64_MAX_EXP__ 385
#define __DEC64_MIN__ 1E-383DD
#define __DEC64_MIN_EXP__ (-382)
#define __DEC64_SUBNORMAL_MIN__ 0.000000000000001E-383DD
#define __DEC_EVAL_METHOD__ 2
#define __DECIMAL_BID_FORMAT__ 1
#define __DECIMAL_DIG__ 21
#define __ELF__ 1
#define __FINITE_MATH_ONLY__ 0
#define __FLOAT_WORD_ORDER__ __ORDER_LITTLE_ENDIAN__
#define __FLT_DECIMAL_DIG__ 9
#define __FLT_DENORM_MIN__ 1.40129846432481707092e-45F
#define __FLT_DIG__ 6
#define __FLT_EPSILON__ 1.19209289550781250000e-7F
#define __FLT_EVAL_METHOD__ 0
#define __FLT_HAS_DENORM__ 1
#define __FLT_HAS_INFINITY__ 1
#define __FLT_HAS_QUIET_NAN__ 1
#define __FLT_MANT_DIG__ 24
#define __FLT_MAX_10_EXP__ 38
#define __FLT_MAX__ 3.40282346638528859812e+38F
#define __FLT_MAX_EXP__ 128
#define __FLT_MIN_10_EXP__ (-37)
#define __FLT_MIN__ 1.17549435082228750797e-38F
#define __FLT_MIN_EXP__ (-125)
#define __FLT_RADIX__ 2
#define __FXSR__ 1
#define __GCC_ATOMIC_BOOL_LOCK_FREE 2
#define __GCC_ATOMIC_CHAR16_T_LOCK_FREE 2
#define __GCC_ATOMIC_CHAR32_T_LOCK_FREE 2
#define __GCC_ATOMIC_CHAR_LOCK_FREE 2
#define __GCC_ATOMIC_INT_LOCK_FREE 2
#define __GCC_ATOMIC_LLONG_LOCK_FREE 2
#define __GCC_ATOMIC_LONG_LOCK_FREE 2
#define __GCC_ATOMIC_POINTER_LOCK_FREE 2
#define __GCC_ATOMIC_SHORT_LOCK_FREE 2
#define __GCC_ATOMIC_TEST_AND_SET_TRUEVAL 1
#define __GCC_ATOMIC_WCHAR_T_LOCK_FREE 2
#define __GCC_HAVE_DWARF2_CFI_ASM 1
#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1
#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1
#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1
#define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1
#define __GCC_IEC_559 2
#define __GCC_IEC_559_COMPLEX 2
#define __GNUC__ 5
#define __GNUC_MINOR__ 2
#define __GNUC_PATCHLEVEL__ 1
#define __GNUC_STDC_INLINE__ 1
#define __gnu_linux__ 1
#define __GXX_ABI_VERSION 1009
#define __has_include_next(STR) __has_include_next__(STR)
#define __has_include(STR) __has_include__(STR)
#define __ILP32__ 1
#define _ILP32 1
#define __INT16_C(c) c
#define __INT16_MAX__ 0x7fff
#define __INT16_TYPE__ short int
#define __INT32_C(c) c
#define __INT32_MAX__ 0x7fffffff
#define __INT32_TYPE__ int
#define __INT64_C(c) c ## LL
#define __INT64_MAX__ 0x7fffffffffffffffLL
#define __INT64_TYPE__ long long int
#define __INT8_C(c) c
#define __INT8_MAX__ 0x7f
#define __INT8_TYPE__ signed char
#define __INT_FAST16_MAX__ 0x7fffffff
#define __INT_FAST16_TYPE__ int
#define __INT_FAST32_MAX__ 0x7fffffff
#define __INT_FAST32_TYPE__ int
#define __INT_FAST64_MAX__ 0x7fffffffffffffffLL
#define __INT_FAST64_TYPE__ long long int
#define __INT_FAST8_MAX__ 0x7f
#define __INT_FAST8_TYPE__ signed char
#define __INT_LEAST16_MAX__ 0x7fff
#define __INT_LEAST16_TYPE__ short int
#define __INT_LEAST32_MAX__ 0x7fffffff
#define __INT_LEAST32_TYPE__ int
#define __INT_LEAST64_MAX__ 0x7fffffffffffffffLL
#define __INT_LEAST64_TYPE__ long long int
#define __INT_LEAST8_MAX__ 0x7f
#define __INT_LEAST8_TYPE__ signed char
#define __INT_MAX__ 0x7fffffff
#define __INTMAX_C(c) c ## LL
#define __INTMAX_MAX__ 0x7fffffffffffffffLL
#define __INTMAX_TYPE__ long long int
#define __INTPTR_MAX__ 0x7fffffff
#define __INTPTR_TYPE__ int
#define __k8 1
#define __k8__ 1
#define __LDBL_DENORM_MIN__ 3.64519953188247460253e-4951L
#define __LDBL_DIG__ 18
#define __LDBL_EPSILON__ 1.08420217248550443401e-19L
#define __LDBL_HAS_DENORM__ 1
#define __LDBL_HAS_INFINITY__ 1
#define __LDBL_HAS_QUIET_NAN__ 1
#define __LDBL_MANT_DIG__ 64
#define __LDBL_MAX_10_EXP__ 4932
#define __LDBL_MAX__ 1.18973149535723176502e+4932L
#define __LDBL_MAX_EXP__ 16384
#define __LDBL_MIN_10_EXP__ (-4931)
#define __LDBL_MIN__ 3.36210314311209350626e-4932L
#define __LDBL_MIN_EXP__ (-16381)
#define __linux 1
#define __linux__ 1
#define linux 1
#define __LONG_LONG_MAX__ 0x7fffffffffffffffLL
#define __LONG_MAX__ 0x7fffffffL
#define __MMX__ 1
#define __NO_INLINE__ 1
#define __ORDER_BIG_ENDIAN__ 4321
#define __ORDER_LITTLE_ENDIAN__ 1234
#define __ORDER_PDP_ENDIAN__ 3412
#define __PRAGMA_REDEFINE_EXTNAME 1
#define __PTRDIFF_MAX__ 0x7fffffff
#define __PTRDIFF_TYPE__ int
#define __REGISTER_PREFIX__ 
#define __SCHAR_MAX__ 0x7f
#define __SHRT_MAX__ 0x7fff
#define __SIG_ATOMIC_MAX__ 0x7fffffff
#define __SIG_ATOMIC_MIN__ (-__SIG_ATOMIC_MAX__ - 1)
#define __SIG_ATOMIC_TYPE__ int
#define __SIZE_MAX__ 0xffffffffU
#define __SIZEOF_DOUBLE__ 8
#define __SIZEOF_FLOAT128__ 16
#define __SIZEOF_FLOAT__ 4
#define __SIZEOF_FLOAT80__ 16
#define __SIZEOF_INT128__ 16
#define __SIZEOF_INT__ 4
#define __SIZEOF_LONG__ 4
#define __SIZEOF_LONG_DOUBLE__ 16
#define __SIZEOF_LONG_LONG__ 8
#define __SIZEOF_POINTER__ 4
#define __SIZEOF_PTRDIFF_T__ 4
#define __SIZEOF_SHORT__ 2
#define __SIZEOF_SIZE_T__ 4
#define __SIZEOF_WCHAR_T__ 4
#define __SIZEOF_WINT_T__ 4
#define __SIZE_TYPE__ unsigned int
#define __SSE__ 1
#define __SSE2__ 1
#define __SSE2_MATH__ 1
#define __SSE_MATH__ 1
#define __STDC__ 1
#define __STDC_HOSTED__ 1
#define __STDC_IEC_559__ 1
#define __STDC_IEC_559_COMPLEX__ 1
#define __STDC_ISO_10646__ 201103L
#define __STDC_NO_THREADS__ 1
#define _STDC_PREDEF_H 1
#define __STDC_UTF_16__ 1
#define __STDC_UTF_32__ 1
#define __STDC_VERSION__ 201112L
#define __UINT16_C(c) c
#define __UINT16_MAX__ 0xffff
#define __UINT16_TYPE__ short unsigned int
#define __UINT32_C(c) c ## U
#define __UINT32_MAX__ 0xffffffffU
#define __UINT32_TYPE__ unsigned int
#define __UINT64_C(c) c ## ULL
#define __UINT64_MAX__ 0xffffffffffffffffULL
#define __UINT64_TYPE__ long long unsigned int
#define __UINT8_C(c) c
#define __UINT8_MAX__ 0xff
#define __UINT8_TYPE__ unsigned char
#define __UINT_FAST16_MAX__ 0xffffffffU
#define __UINT_FAST16_TYPE__ unsigned int
#define __UINT_FAST32_MAX__ 0xffffffffU
#define __UINT_FAST32_TYPE__ unsigned int
#define __UINT_FAST64_MAX__ 0xffffffffffffffffULL
#define __UINT_FAST64_TYPE__ long long unsigned int
#define __UINT_FAST8_MAX__ 0xff
#define __UINT_FAST8_TYPE__ unsigned char
#define __UINT_LEAST16_MAX__ 0xffff
#define __UINT_LEAST16_TYPE__ short unsigned int
#define __UINT_LEAST32_MAX__ 0xffffffffU
#define __UINT_LEAST32_TYPE__ unsigned int
#define __UINT_LEAST64_MAX__ 0xffffffffffffffffULL
#define __UINT_LEAST64_TYPE__ long long unsigned int
#define __UINT_LEAST8_MAX__ 0xff
#define __UINT_LEAST8_TYPE__ unsigned char
#define __UINTMAX_C(c) c ## ULL
#define __UINTMAX_MAX__ 0xffffffffffffffffULL
#define __UINTMAX_TYPE__ long long unsigned int
#define __UINTPTR_MAX__ 0xffffffffU
#define __UINTPTR_TYPE__ unsigned int
#define __unix 1
#define __unix__ 1
#define unix 1
#define __USER_LABEL_PREFIX__ 
#define __VERSION__ "5.2.1 20150911"
#define __WCHAR_MAX__ 0x7fffffffL
#define __WCHAR_MIN__ (-__WCHAR_MAX__ - 1)
#define __WCHAR_TYPE__ long int
#define __WINT_MAX__ 0xffffffffU
#define __WINT_MIN__ 0U
#define __WINT_TYPE__ unsigned int
#define __x86_64 1
#define __x86_64__ 1

回答1:

There doesn't appear to be a predefined macro that explicitly specifies an x32 environment. Comparing the output of cpp -dM and cpp -dM -mx32, the symbols _ILP32 and __ILP32__ are defined only for x32, and _LP64 and __LP64__ are defined only for x86_64 without x32. A number of other predefined macros have different values for the two environments.

I think the most straightforward way to detect x32 at compile time is to examine the __x86_64__ and SIZE_MAX macros. The former is predefined (or not) by gcc, and the latter is defined in <stdint.h>.

The following program demonstrates this. It works correctly with gcc -m64, gcc -m32, and gcc -mx32 on an x86_64 system,, and with gcc on a non-x86_64 system (SPARC).

#include <stdio.h>
#include <stdint.h>
int main(void) {
#ifdef __x86_64__
    #if SIZE_MAX == 0xFFFFFFFF
        puts("This is x32");
    #else
        puts("This is x86_64 but not x32");
    #endif
#else
    puts("This is not x64_64");
#endif
}

来源：https://stackoverflow.com/questions/32645109/how-to-detect-x32-abi-or-environment-in-the-preprocessor

标签

gcc

clang

c-preprocessor

linux-x32-abi

64-bit executable runs slower than 32-bit version

无人久伴 — Thu, 19 Dec 2019 19:07:17 +0000

64-bit executable runs slower than 32-bit version 无人久伴 2019-12-20 03:07:17

问题

I have a 64-bit Ubuntu 13.04 system. I was curious to see how 32-bit applications perform against 64-bit applications on a 64-bit system so I compiled the following C program as 32-bit and 64-bit executable and recorded the time they took to execute. I used gcc flags to compile for 3 different architectures:

-m32: Intel 80386 architecture (int, long, pointer all set to 32 bits (ILP32))
-m64: AMD's x86-64 architecture (int 32 bits; long, pointer 64 bits (LP64))
-mx32: AMD's x86-64 architecture (int, long, pointer all set to 32 bits (ILP32), but CPU in long mode with sixteen 64b registers, and register call ABI)

// this program solves the
// project euler problem 16.

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <assert.h>
#include <sys/time.h>

int sumdigit(int a, int b);

int main(void) {
    int a = 2;
    int b = 10000;
    struct timeval start, finish;
    unsigned int i;
    gettimeofday(&start, NULL);
    for(i = 0; i < 1000; i++)
        (void)sumdigit(a, b);
    gettimeofday(&finish, NULL);
    printf("Did %u calls in %.4g seconds\n", 
            i, 
            finish.tv_sec - start.tv_sec + 1E-6 * (finish.tv_usec - start.tv_usec));
    return 0;
}

int sumdigit(int a, int b) {
    // numlen = number of digit in a^b
    // pcount = power of 'a' after ith iteration
    // dcount = number of digit in a^(pcount)

    int numlen = (int) (b * log10(a)) + 1;
    char *arr = calloc(numlen, sizeof *arr);
    int pcount = 0;
    int dcount = 1;
    arr[numlen - 1] = 1;
    int i, sum, carry;

    while(pcount < b) {
        pcount += 1;

        sum = 0; 
        carry = 0;

        for(i = numlen - 1; i >= numlen - dcount; --i) {
            sum = arr[i] * a + carry;
            carry = sum / 10;
            arr[i] = sum % 10;
        }

        while(carry > 0) {
            dcount += 1;
            sum = arr[numlen - dcount] + carry;
            carry = sum / 10;
            arr[numlen - dcount] = sum % 10;
        } 
    }

    int result = 0;
    for(i = numlen - dcount; i < numlen; ++i)
        result += arr[i];

    free(arr);
    return result;
}

The commands I used to get different executable:

gcc -std=c99 -Wall -Wextra -Werror -pedantic -pedantic-errors pe16.c -o pe16_x32 -lm -mx32
gcc -std=c99 -Wall -Wextra -Werror -pedantic -pedantic-errors pe16.c -o pe16_32 -lm -m32
gcc -std=c99 -Wall -Wextra -Werror -pedantic -pedantic-errors pe16.c -o pe16_64 -lm

Here are the results I got:

ajay@ajay:c$ ./pe16_x32
Did 1000 calls in 89.19 seconds

ajay@ajay:c$ ./pe16_32
Did 1000 calls in 88.82 seconds

ajay@ajay:c$ ./pe16_64
Did 1000 calls in 92.05 seconds

Why does the 64-bit version runs slower than the 32-bit one? I read that the 64-bit architecture has improved instruction set and twice more general purpose registers compared to the 32-bit architecture which allows for more optimizations. When can I expect a better performance on a 64-bit system?

Edit I turned on the optimization using -O3 flag and now the results are:

ajay@ajay:c$ ./pe16_x32
Did 1000 calls in 38.07 seconds

ajay@ajay:c$ ./pe16_32
Did 1000 calls in 38.32 seconds

ajay@ajay:c$ ./pe16_64
Did 1000 calls in 38.27 seconds

回答1:

Comparing performance of code without optimisations is rather pointless. If you care about performance, you'll only ever use optimised code.

And when you enable optimisations you find that the performance differences are negligible. That is to be expected. The operations you perform are all integer based operations, using data of the same size in all cases. Since the 32 bit and 64 bit code run on the same integer hardware units you should expect the same performance.

You are not using any floating point operations which is one area where there are sometimes differences between 32 and 64 bit code due to different floating point hardware units (x64 uses SSE, x86 may use x87).

In short, the results are exactly as expected.

来源：https://stackoverflow.com/questions/21525610/64-bit-executable-runs-slower-than-32-bit-version

标签

How should the [u]int_fastN_t types be defined for x86_64, with or without the x32 ABI?

梦想的初衷 — Tue, 10 Dec 2019 07:13:29 +0000

How should the [u]int_fastN_t types be defined for x86_64, with or without the x32 ABI? 梦想的初衷 2019-12-10 15:13:29

问题

The x32 ABI specifies, among other things, 32-bit pointers for code generated for the x86_64 architecture. It combines the advantages of the x86_64 architecture (including 64-bit CPU registers) with the reduced overhead of 32-bit pointers.

The <stdint.h> header defines typedefs int_fast8_t, int_fast16_t, int_fast32_t, and int_fast64_t (and corresponding unsigned types uint_fast8_t et al), each of which is:

an integer type that is usually fastest to operate with among all integer types that have at least the specified width

with a footnote:

The designated type is not guaranteed to be fastest for all purposes; if the implementation has no clear grounds for choosing one type over another, it will simply pick some integer type satisfying the signedness and width requirements.

(Quoted from the N1570 C11 draft.)

The question is, how should [u]int_fast16_t and [u]int_fast32_t types be defined for the x86_64 architecture, with or without the x32 ABI? Is there an x32 document that specifies these types? Should they be compatible with the 32-bit x86 definitions (both 32 bits) or, since x32 has access to 64-bit CPU registers, should they be the same size with or without the x32 ABI? (Note that the x86_64 has 64-bit registers regardless of whether the x32 ABI is in use or not.)

Here's a test program (which depends on the gcc-specific __x86_64__ macro):

#include <stdio.h>
#include <stdint.h>
#include <limits.h>

int main(void) {
#if defined __x86_64__ && SIZE_MAX == 0xFFFFFFFF
    puts("This is x86_64 with the x32 ABI");
#elif defined __x86_64__ && SIZE_MAX > 0xFFFFFFFF
    puts("This is x86_64 without the x32 ABI");
#else
    puts("This is not x86_64");
#endif
    printf("uint_fast8_t  is %2zu bits\n", CHAR_BIT * sizeof (uint_fast8_t));
    printf("uint_fast16_t is %2zu bits\n", CHAR_BIT * sizeof (uint_fast16_t));
    printf("uint_fast32_t is %2zu bits\n", CHAR_BIT * sizeof (uint_fast32_t));
    printf("uint_fast64_t is %2zu bits\n", CHAR_BIT * sizeof (uint_fast64_t));
}

When I compile it with gcc -m64, the output is:

This is x86_64 without the x32 ABI
uint_fast8_t  is  8 bits
uint_fast16_t is 64 bits
uint_fast32_t is 64 bits
uint_fast64_t is 64 bits

When I compile it with gcc -mx32, the output is:

This is x86_64 with the x32 ABI
uint_fast8_t  is  8 bits
uint_fast16_t is 32 bits
uint_fast32_t is 32 bits
uint_fast64_t is 64 bits

(which, apart from the first line, matches the output with gcc -m32, which generates 32-bit x86 code).

Is this a bug in glibc (which defines the <stdint.h> header), or is it following some x32 ABI requirement? There are no references to the [u]int_fastN_t types in either the x32 ABI document or the x86_64 ABI document, but there could be something else that specifies it.

One could argue that the fast16 and fast32 types should be 64 bits with or with x32, since 64-bit registers are available; would that makes more sense that the current behavior?

(I've substantially edited the original question, which asked only about the x32 ABI. The question now asks about x86_64 with or without x32.)

回答1:

Generally speaking you would expect 32-bit integer types to be marginally faster than 64-bit integer types on x86-64 CPUs. Partly because they use less memory, but also because 64-bit instructions require an extra prefix byte over their 32-bit counterparts. The 32-bit division instruction is significantly faster than 64-bit one, but otherwise instruction execution latencies are the same.

It isn't normally necessary to extend 32-bit when loading them into 64-bit registers. While the CPU automatically zero-extends the values in this case, this is usually only a benefit because it avoids partial register stalls. What gets loaded into upper part of the register is less important than the fact that the entire register is modified. The contents of the upper part of the register don't matter because when they're used to hold 32-bit types they're normally only used with 32-bit instructions that only work with the lower 32-bit part of the register.

The inconsistency between between the sizes of int_fast32_t types when using the x32 and x86-64 ABIs is probably best justified by the fact that pointers are 64 bits wide. Whenever a 32-bit integer is added to a pointer it would need to be extended, making this a much more likely occurrence when using the x86-64 ABI.

Another factor to consider is that whole point of the x32 ABI is to get better performance by using smaller types. Any application that benefits from pointers and related types being smaller should also benefit from int_fast32_t being smaller as well.

回答2:

I have compiled the following sample code to check the generated code for a simple sum with different integer types:

#include <stdint.h>

typedef int16_t INT;
//typedef int32_t INT;
//typedef int64_t INT;

INT foo()
{
    volatile INT a = 1, b = 2;
    return a + b;
}

And then I disassembled the code generated with each of the integer types. The compilation command is gcc -Ofast -mx32 -c test.c. Note that in full 64-bit mode the generated code will be almost the same because there are no pointers in my code (only %rsp instead of %esp).

With int16_t it emits:

00000000 <foo>:
   0:   b8 01 00 00 00          mov    $0x1,%eax
   5:   ba 02 00 00 00          mov    $0x2,%edx
   a:   67 66 89 44 24 fc       mov    %ax,-0x4(%esp)
  10:   67 66 89 54 24 fe       mov    %dx,-0x2(%esp)
  16:   67 0f b7 54 24 fc       movzwl -0x4(%esp),%edx
  1c:   67 0f b7 44 24 fe       movzwl -0x2(%esp),%eax
  22:   01 d0                   add    %edx,%eax
  24:   c3                      retq

With int32_t:

00000000 <foo>:
   0:   67 c7 44 24 f8 01 00 00 00  movl   $0x1,-0x8(%esp)
   9:   67 c7 44 24 fc 02 00 00 00  movl   $0x2,-0x4(%esp)
  12:   67 8b 54 24 f8              mov    -0x8(%esp),%edx
  17:   67 8b 44 24 fc              mov    -0x4(%esp),%eax
  1c:   01 d0                       add    %edx,%eax
  1e:   c3                          retq

And with int64_t:

00000000 <foo>:
   0:   67 48 c7 44 24 f0 01 00 00 00   movq   $0x1,-0x10(%esp)
   a:   67 48 c7 44 24 f8 02 00 00 00   movq   $0x2,-0x8(%esp)
  14:   67 48 8b 54 24 f0               mov    -0x10(%esp),%rdx
  1a:   67 48 8b 44 24 f8               mov    -0x8(%esp),%rax
  20:   48 01 d0                        add    %rdx,%rax
  23:   c3                              retq

Now, I don't claim to know exactly why the compiler generated exactly this code (maybe the volatile keyword combined with a non-register-size integer type is not the best choice?). But from that generated code we can draw the following conclusions:

The slowest type is int16_t. It needs additional instructions to move the values around.
The fastest type is int32_t. Although the 32-bit and the 64-bit versions have the same number of instructions, the 32-bit code is shorter in bytes, so it will be more cache friendly, so faster.

So the natural choices for the fast types would be:

For int_fast16_t, choose int32_t.
For int_fast32_t, choose int32_t.
For int_fast64_t, choose int64_t (what else).

回答3:

Tough. Let's just take int_fast8_t. If a developer uses a large array to store lots of 8 bit signed integers, then int8_t will be fastest because of caching. I'd declare that using large arrays of int_fast8_t is likely a bad idea.

You'd need to take a large codebase, and systematically replace int8_t and signed chars and plain char if it is signed with int_fast8_t. Then benchmark the code using different typedefs for int_fast8_t, and measure what's fastest.

Note that undefined behaviour is going to change. For example assigning 255 will give a result of -1 if the type is int8_t and 255 otherwise.

来源：https://stackoverflow.com/questions/36961100/how-should-the-uint-fastn-t-types-be-defined-for-x86-64-with-or-without-the-x

标签

stdint

linux-x32-abi

How to detect X32 on Windows?

喜欢而已 — Sat, 07 Dec 2019 19:51:15 +0000

How to detect X32 on Windows? 喜欢而已 2019-12-08 03:51:15

问题

X32 allows one to write programs using 32-bit integers, longs and pointers that run on x86_64 processors. Using X32 has a number of benefits under certain use cases. (X32 is different than X86 or X64; see Difference between x86, x32, and x64 architectures for more details).

It appears some Windows Enterprise Server supports X32, but I'm having trouble finding more information on it. That's based on some Intel PDFs, like Intel® Xeon® Processor E5-2400 Series-based Platforms for Intelligent Systems:

Microsoft's documentation on Predefined Macros lists the usual suspect, like _M_X64 and _M_AMD64. But it does not appear to discuss an architecture option for X32.

If Microsoft supports X32, then I suspect it is going to be an option similar to large address space aware or terminal service aware.

Does Microsoft actually support X32 (as opposed to X86 and X64)?

If so, how can I determine when X32 is being selected under Windows?
If not, then why does Intel specifically call out the X32 platform for Windows?

回答1:

The question

Does Microsoft actually support X32 (as opposed to X86 and X64)?

TL;DR answer

The answer is "No, it's not supported by Microsoft." The preprocessor macros don't lead to any identification of X32, the command line options and IDE options don't exist, and the strings identifying such a compiler don't exist.

The long answer — Part I

"There are no header strings for X32"

Disregarding the following facts:

no official documentation of such a feature exists,
no option in Visual Studio or cl.exe /? to enable/disable it exists, and
strings -el clui.dll shows no sign of such an option,

strings -el "%VCINSTALLDIR%\bin\1033\clui.dll" | find "Microsoft (R)" shows no sign of a matching header string either:

4Microsoft (R) C/C++ Optimizing Compiler Version %s
-for Microsoft (R) .NET Framework version %s
(Microsoft (R) C/C++ Optimizing Compiler
FMicrosoft (R) C/C++ Optimizing Compiler Version %s for MIPS R-Series
)Microsoft (R) MIPS Assembler Version %s
CMicrosoft (R) C/C++ Optimizing Compiler Version %s for Renesas SH
<Microsoft (R) C/C++ Optimizing Compiler Version %s for ARM
:Microsoft (R) C/C++ Standard Compiler Version %s for x86
<Microsoft (R) C/C++ Optimizing Compiler Version %s for x86
GMicrosoft (R) 32-bit C/C++ Optimizing Compiler Version %s for PowerPC
@Microsoft (R) C/C++ Optimizing Compiler Version %s for Itanium
<Microsoft (R) C/C++ Optimizing Compiler Version %s for x64
>Microsoft (R) C/C++ Optimizing Compiler Version %s for ARM64
Microsoft (R) MIPS Assembler

The same output is seen in the bin\x86_amd64\1033\clui.dll and bin\x86_arm\1033\clui.dll files, so it's not like that one file simply didn't include it.

The long answer — Part II

"Windows doesn't do data models"

Let's suppose it did. How would you detect it? In the case of GLIBC, __ILP32__ is defined for x32 and x86 while __LP64__ is defined for amd64, denoting the data model used. Additionally, __x86_64__ will be defined for the AMD64 architecture. If __x86_64__ is defined and __ILP32__ is defined, then you're using the X32 ABI, else you're using the AMD64 ABI. For C, that's all that matters. If you're utilizing assembly code, that's where the differentiation between the X32 ABI and the x86 ABI matters, hence checking __x86_64__ to determine that the architecture targeted is 64-bit and checking __ILP32__ to determine whether the 32-bit or 64-bit ABI is in use. For example:

#ifdef __x86_64__
# ifdef __ILP32__

// Use X32 version of myfunc().
extern long myfunc_x32 (const char *);
long (*myfunc)(const char *) = myfunc_x32;

# else /* !__ILP32__ */

// Use AMD64 version of myfunc().
extern long myfunc_amd64 (const char *);
long (*myfunc)(const char *) = myfunc_amd64;

# endif /* __ILP32__ */

/* !__x86_64__ */
#elif defined __i386__

// Use x86 version of myfunc().
extern long myfunc_x86 (const char *);
long (*myfunc)(const char *) = myfunc_x86;

/* !__i386__ */
#else

// Use generic version of myfunc() since no optimized versions are available.
long myfunc(const char *);

#endif /* __x86_64__ */

However, there is no macro indicating the data model on Windows. You target one of the following architectures:

32-bit x86 (_M_IX86)
64-bit AMD64 (_M_AMD64/_M_X64)
(32-bit?) ARM (_M_ARM)

Theoretically one could use _M_AMD64 and _M_X64 independently to determine whether X32 exists, but if _M_AMD64 is defined, _M_X64 is also defined.

The long answer — Part III

"The bad news"

In the end, after searching to find anything, perhaps even long forgotten material, there is no evidence that Windows has supported or ever will support coding for an X32 ABI like Linux. The preprocessor macros don't help in identifying X32, the command line options and IDE options don't exist, and the strings identifying such a compiler don't exist.

The long answer — A new hope dashed

"These aren't the macros you're looking for"

One could hypothetically use the currently existing macros to check, but it's not like it helps in this case because X32 for Windows doesn't exist. It's not unlike the GLIBC check, though instead of enabling X32 if __ILP32__ is defined, you enable it if _M_X64 is not defined.

#ifdef _M_AMD64
# ifndef _M_X64
#  define ABI_STR "X32"
# else
#  define ABI_STR "AMD64"
# endif
#elif defined _M_IX86
# define ABI_STR "X86"
#else
# error unsupported CPU/architecture
#endif

Of course, if _M_AMD64 is defined, then _M_X64 is defined too, further reinforcing the evidence that there is no X32 for Windows.

回答2:

Does Microsoft actually support X32 (as opposed to X86 and X64)?

No.

回答3:

Windows doesn't have an x32 ABI. However it has a feature that gives you memory only in the low 2GB of address space. Just disable the /LARGEADDRESSAWARE flag (by default it's enabled for 64-bit binaries) and then you can use 32-bit pointers inside your 64-bit application

User space pointers in those binaries will have the top bits zeroed, so it's essentially just similar to x32 ABI on Linux. long in Windows has always been a 32-bit type, thus it's also the same as in x32 ABI where long and pointers are 32-bit wide

By default, 64-bit Microsoft Windows-based applications have a user-mode address space of several terabytes. For precise values, see Memory Limits for Windows and Windows Server Releases. However, applications can specify that the system should allocate all memory for the application below 2 gigabytes. This feature is beneficial for 64-bit applications if the following conditions are true:

A 2 GB address space is sufficient.

The code has many pointer truncation warnings.

Pointers and integers are freely mixed.

The code has polymorphism using 32-bit data types.

All pointers are still 64-bit pointers, but the system ensures that every memory allocation occurs below the 2 GB limit, so that if the application truncates a pointer, no significant data is lost. Pointers can be truncated to 32-bit values, then extended to 64-bit values by either sign extension or zero extension.

Virtual Address Space

But nowadays even on Linux kernel developers are discussing to drop x32 Support

回答4:

Sorry about the late answer (and the injustice to David).

I was reading on ml64.exe at MASM for x64 (ml64.exe), and I came across 32-Bit Address Mode in the assembler. It provides the X32 address size overrides.

So it appears Windows tools do provide an X32 related support. It also explains how Intel can produce X32 binaries and drivers. I'm just speculating, but I suspect Intel is probably using a custom allocator or VirtualAlloc to ensure pointer addresses are in a certain range.

It also appears that the Windows operating system does not have a custom built kernel, like say Debian 8, where its provided ground-up from the OS. That is, its up to the developer to ensure integers, longs and pointers are also within a 32-bit range.

回答5:

Small footnote to phuclv's answer regarding disabling the /LARGEADDRESSAWARE for given process: In certain cases, when data structures are favorable, and one takes steps necessary to actually use 32-bit pointers in 64-bit mode, there is too potential for performance gains on Windows, as it is on Linux, albeit not as large. See: Benchmark of 32-bit pointers in 64-bit code on Windows

来源：https://stackoverflow.com/questions/32675300/how-to-detect-x32-on-windows

标签

How to detect X32 on Windows?

眉间皱痕 — Fri, 06 Dec 2019 07:51:14 +0000

How to detect X32 on Windows? 眉间皱痕 2019-12-06 15:51:14

Microsoft's documentation on Predefined Macros lists the usual suspect, like _M_X64 and _M_AMD64. But it does not appear to discuss an architecture option for X32.

If Microsoft supports X32, then I suspect it is going to be an option similar to large address space aware or terminal service aware.

Does Microsoft actually support X32 (as opposed to X86 and X64)?

If so, how can I determine when X32 is being selected under Windows?
If not, then why does Intel specifically call out the X32 platform for Windows?

The question

Does Microsoft actually support X32 (as opposed to X86 and X64)?

TL;DR answer

The long answer — Part I

"There are no header strings for X32"

Disregarding the following facts:

no official documentation of such a feature exists,
no option in Visual Studio or cl.exe /? to enable/disable it exists, and
strings -el clui.dll shows no sign of such an option,

strings -el "%VCINSTALLDIR%\bin\1033\clui.dll" | find "Microsoft (R)" shows no sign of a matching header string either:

4Microsoft (R) C/C++ Optimizing Compiler Version %s
-for Microsoft (R) .NET Framework version %s
(Microsoft (R) C/C++ Optimizing Compiler
FMicrosoft (R) C/C++ Optimizing Compiler Version %s for MIPS R-Series
)Microsoft (R) MIPS Assembler Version %s
CMicrosoft (R) C/C++ Optimizing Compiler Version %s for Renesas SH
<Microsoft (R) C/C++ Optimizing Compiler Version %s for ARM
:Microsoft (R) C/C++ Standard Compiler Version %s for x86
<Microsoft (R) C/C++ Optimizing Compiler Version %s for x86
GMicrosoft (R) 32-bit C/C++ Optimizing Compiler Version %s for PowerPC
@Microsoft (R) C/C++ Optimizing Compiler Version %s for Itanium
<Microsoft (R) C/C++ Optimizing Compiler Version %s for x64
>Microsoft (R) C/C++ Optimizing Compiler Version %s for ARM64
Microsoft (R) MIPS Assembler

The same output is seen in the bin\x86_amd64\1033\clui.dll and bin\x86_arm\1033\clui.dll files, so it's not like that one file simply didn't include it.

The long answer — Part II

"Windows doesn't do data models"

#ifdef __x86_64__
# ifdef __ILP32__

// Use X32 version of myfunc().
extern long myfunc_x32 (const char *);
long (*myfunc)(const char *) = myfunc_x32;

# else /* !__ILP32__ */

// Use AMD64 version of myfunc().
extern long myfunc_amd64 (const char *);
long (*myfunc)(const char *) = myfunc_amd64;

# endif /* __ILP32__ */

/* !__x86_64__ */
#elif defined __i386__

// Use x86 version of myfunc().
extern long myfunc_x86 (const char *);
long (*myfunc)(const char *) = myfunc_x86;

/* !__i386__ */
#else

// Use generic version of myfunc() since no optimized versions are available.
long myfunc(const char *);

#endif /* __x86_64__ */

However, there is no macro indicating the data model on Windows. You target one of the following architectures:

32-bit x86 (_M_IX86)
64-bit AMD64 (_M_AMD64/_M_X64)
(32-bit?) ARM (_M_ARM)

Theoretically one could use _M_AMD64 and _M_X64 independently to determine whether X32 exists, but if _M_AMD64 is defined, _M_X64 is also defined.

The long answer — Part III

"The bad news"

The long answer — A new hope dashed

"These aren't the macros you're looking for"

#ifdef _M_AMD64
# ifndef _M_X64
#  define ABI_STR "X32"
# else
#  define ABI_STR "AMD64"
# endif
#elif defined _M_IX86
# define ABI_STR "X86"
#else
# error unsupported CPU/architecture
#endif

Of course, if _M_AMD64 is defined, then _M_X64 is defined too, further reinforcing the evidence that there is no X32 for Windows.

Does Microsoft actually support X32 (as opposed to X86 and X64)?

No.

Windows doesn't have an x32 ABI. However it has a feature that gives you memory only in the low 2GB of address space. Just disable the /LARGEADDRESSAWARE flag (by default it's enabled for 64-bit binaries) and then you can use 32-bit pointers inside your 64-bit application

By default, 64-bit Microsoft Windows-based applications have a user-mode address space of several terabytes. For precise values, see Memory Limits for Windows and Windows Server Releases. However, applications can specify that the system should allocate all memory for the application below 2 gigabytes. This feature is beneficial for 64-bit applications if the following conditions are true:

A 2 GB address space is sufficient.

The code has many pointer truncation warnings.

Pointers and integers are freely mixed.

The code has polymorphism using 32-bit data types.

All pointers are still 64-bit pointers, but the system ensures that every memory allocation occurs below the 2 GB limit, so that if the application truncates a pointer, no significant data is lost. Pointers can be truncated to 32-bit values, then extended to 64-bit values by either sign extension or zero extension.

Virtual Address Space

But nowadays even on Linux kernel developers are discussing to drop x32 Support

Sorry about the late answer (and the injustice to David).

I was reading on ml64.exe at MASM for x64 (ml64.exe), and I came across 32-Bit Address Mode in the assembler. It provides the X32 address size overrides.

来源：https://stackoverflow.com/questions/32675300/how-to-detect-x32-on-windows

标签