Skip to content

Commit

Permalink
New macro (AO_NO_ASM_XCHG) to use Win32 InterlockedExchange
Browse files Browse the repository at this point in the history
This allows the client (by defining the `AO_NO_ASM_XCHG` macro) to avoid
assembly code in the implementation of `AO_test_and_set_full()` and
`AO_nop_full()` in msftc/x86.h, and of `AO_compiler_barrier()` in
atomic_ops.h.

* README_win32.txt: Document `AO_NO_ASM_XCHG` macro.
* src/atomic_ops.h [!(__GNUC__ && !__INTEL_COMPILER) && !(_MSC_VER
|| __DMC__ || __BORLANDC__ || __WATCOMC__) && !(_AMD64_ || _M_X64
|| _MSC_VER>=1400) && AO_NO_ASM_XCHG] (AO_barrier_dummy): Define static
volatile variable.
* src/atomic_ops.h [!(__GNUC__ && !__INTEL_COMPILER) && !(_MSC_VER
|| __DMC__ || __BORLANDC__ || __WATCOMC__) && !(_AMD64_ || _M_X64
|| _MSC_VER>=1400) && AO_NO_ASM_XCHG] (AO_compiler_barrier): Define to
`AO_barrier_dummy++` instead of `__asm{}`.
* src/atomic_ops/sysdeps/msftc/common32_defs.h [AO_NO_ASM_XCHG]
(_InterlockedExchange): Declare intrinsic.
* src/atomic_ops/sysdeps/msftc/x86.h [AO_USE_PENTIUM4_INSTRS
&& AO_NO_ASM_XCHG] (AO_nop_full): Do not define (to `__asm{mfence}`);
add comment.
* src/atomic_ops/sysdeps/msftc/x86.h [!AO_NO_ASM_XCHG] (AO_nop_full):
Reformat comments.
* src/atomic_ops/sysdeps/msftc/x86.h [!AO_HAVE_test_and_set_full
&& AO_NO_ASM_XCHG]: Include `test_and_set_t_is_ao_t.h` instead of
`test_and_set_t_is_char.h`.
* src/atomic_ops/sysdeps/msftc/x86.h [!AO_HAVE_test_and_set_full
&& AO_NO_ASM_XCHG] (AO_test_and_set_full): Call _InterlockedExchange()
instead of using `__asm{...}`.
  • Loading branch information
ivmai committed Dec 6, 2024
1 parent 1159140 commit 51c78e1
Show file tree
Hide file tree
Showing 4 changed files with 61 additions and 36 deletions.
2 changes: 2 additions & 0 deletions README_win32.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ rarely needed in practice):
* AO_ASM_X64_AVAILABLE - inline assembly available (only x86_64)
* AO_NO_ASM_XADD - do not define asm-based AO_char_fetch_and_add_full and
AO_short_fetch_and_add_full primitives (x86 only)
* AO_NO_ASM_XCHG - use Win32 _InterlockedExchange primitive to implement
test-and-set and AO_nop_full operations (x86 only)
* AO_ASSUME_VISTA - assume Windows Server 2003, Vista or later target (only
x86, implied if Visual Studio 2015 or older)
* AO_CMPXCHG16B_AVAILABLE - assume target is not old AMD Opteron chip (only
Expand Down
3 changes: 3 additions & 0 deletions src/atomic_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,9 @@ struct AO_uintptr_t_size_static_assert {
# define AO_compiler_barrier() _ReadWriteBarrier()
/* We assume this does not generate a fence instruction. */
/* The documentation is a bit unclear. */
# elif defined(AO_NO_ASM_XCHG)
/* The client asked to avoid inline assembly, so the barrier is */
/* expressed as an increment of a volatile dummy variable instead of */
/* an empty __asm block. The variable is static, thus every */
/* translation unit that includes this file gets its own copy. */
/* NOTE(review): unlike an empty asm block, the increment performs a */
/* real load and store of the dummy variable - confirm the cost is */
/* acceptable for the callers. */
static volatile int AO_barrier_dummy;
# define AO_compiler_barrier() (void)(AO_barrier_dummy++)
# else
# define AO_compiler_barrier() __asm { }
/* The preceding implementation may be preferable here too. */
Expand Down
30 changes: 20 additions & 10 deletions src/atomic_ops/sysdeps/msftc/common32_defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,9 @@
# define _InterlockedDecrement InterlockedDecrement
# define _InterlockedExchangeAdd InterlockedExchangeAdd
# define _InterlockedCompareExchange InterlockedCompareExchange
# ifdef AO_NO_ASM_XCHG
# define _InterlockedExchange InterlockedExchange
# endif

# define AO_INTERLOCKED_VOLATILE /**/

Expand All @@ -61,19 +64,26 @@
# endif

# else /* elif _MSC_VER < 1400 */
# ifdef __cplusplus
extern "C" {
# endif
LONG __cdecl _InterlockedIncrement(LONG volatile *);
LONG __cdecl _InterlockedDecrement(LONG volatile *);
LONG __cdecl _InterlockedExchangeAdd(LONG volatile *, LONG);
LONG __cdecl _InterlockedCompareExchange(LONG volatile *,
# ifdef __cplusplus
extern "C" {
# endif
LONG __cdecl _InterlockedIncrement(LONG volatile *);
LONG __cdecl _InterlockedDecrement(LONG volatile *);
LONG __cdecl _InterlockedExchangeAdd(LONG volatile *, LONG);
LONG __cdecl _InterlockedCompareExchange(LONG volatile *,
LONG /* Exchange */, LONG /* Comp */);
# ifdef __cplusplus
} /* extern "C" */
# endif
# ifdef AO_NO_ASM_XCHG
LONG __cdecl _InterlockedExchange(LONG volatile *, LONG);
# endif
# ifdef __cplusplus
} /* extern "C" */
# endif
# endif /* _MSC_VER < 1400 */

# ifdef AO_NO_ASM_XCHG
# pragma intrinsic (_InterlockedExchange)
# endif

# if !defined(AO_PREFER_GENERALIZED) || !defined(AO_ASSUME_WINDOWS98)
# pragma intrinsic (_InterlockedIncrement)
# pragma intrinsic (_InterlockedDecrement)
Expand Down
62 changes: 36 additions & 26 deletions src/atomic_ops/sysdeps/msftc/x86.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,34 +47,35 @@
#define AO_T_IS_INT

#ifndef AO_USE_INTERLOCKED_INTRINSICS
/* _Interlocked primitives (Inc, Dec, Xchg, Add) are always available */
/* Interlocked primitives (Inc, Dec, Xchg, Add) are always available. */
# define AO_USE_INTERLOCKED_INTRINSICS
#endif
#include "common32_defs.h"

/* As far as we can tell, the lfence and sfence instructions are not */
/* currently needed or useful for cached memory accesses. */

/* Unfortunately mfence doesn't exist everywhere. */
/* IsProcessorFeaturePresent(PF_COMPARE_EXCHANGE128) is */
/* probably a conservative test for it? */

#if defined(AO_USE_PENTIUM4_INSTRS)

/* Memory barrier implemented by a single mfence instruction. It is */
/* provided only if the client defines AO_USE_PENTIUM4_INSTRS, since */
/* mfence does not exist on every x86 processor. */
AO_INLINE void
AO_nop_full(void)
{
__asm { mfence }
}
/* Record that AO_nop_full is defined here. */
#define AO_HAVE_nop_full

#ifdef AO_NO_ASM_XCHG
/* Use the default implementation based on test_and_set_full. */
#else

/* We could use the cpuid instruction. But that seems to be slower */
/* than the default implementation based on test_and_set_full. Thus */
/* we omit that bit of misinformation here. */
/* As far as we can tell, the lfence and sfence instructions */
/* are not currently needed or useful for cached memory */
/* accesses. Unfortunately mfence doesn't exist everywhere; */
/* IsProcessorFeaturePresent(PF_COMPARE_EXCHANGE128) is */
/* probably a conservative test for it. */

#endif
# ifdef AO_USE_PENTIUM4_INSTRS
/* Memory barrier implemented by a single mfence instruction. It is */
/* provided only if the client defines AO_USE_PENTIUM4_INSTRS (and */
/* does not define AO_NO_ASM_XCHG), since mfence does not exist on */
/* every x86 processor. */
AO_INLINE void
AO_nop_full(void)
{
__asm { mfence }
}
/* Record that AO_nop_full is defined here. */
# define AO_HAVE_nop_full
# else
/* We could use the cpuid instruction. But that seems to */
/* be slower than the default implementation based on */
/* test_and_set_full. Thus we omit that bit of */
/* misinformation here. */
# endif
#endif /* !AO_NO_ASM_XCHG */

#if !defined(AO_NO_ASM_XADD) && !defined(AO_HAVE_char_fetch_and_add_full)
AO_INLINE unsigned char
Expand Down Expand Up @@ -105,18 +106,27 @@ AO_nop_full(void)
#endif /* !AO_NO_ASM_XADD */

#ifndef AO_HAVE_test_and_set_full
# include "../test_and_set_t_is_char.h"
# ifdef AO_NO_ASM_XCHG
# include "../test_and_set_t_is_ao_t.h"
# else
# include "../test_and_set_t_is_char.h"
# endif

/* Atomically exchange the location pointed to by addr with the "set" */
/* value. If AO_NO_ASM_XCHG is defined, the exchange is performed by */
/* _InterlockedExchange() on the location viewed as a long, and the */
/* value returned by that call is the function result. Otherwise, */
/* an inline-assembly xchg instruction operates on the byte at addr */
/* with 0xff in al; there is no return statement in that variant, */
/* the result is presumably left in eax (hence the remark about the */
/* "missing return value" warning). */
/* NOTE(review): _InterlockedExchange() returns a long; check whether */
/* an explicit cast to AO_TS_VAL_t is needed when this header is */
/* compiled as C++. */
AO_INLINE AO_TS_VAL_t
AO_test_and_set_full(volatile AO_TS_t *addr)
{
__asm
{
# ifdef AO_NO_ASM_XCHG
return _InterlockedExchange((long AO_INTERLOCKED_VOLATILE *)addr,
AO_TS_SET);
# else
__asm
{
mov eax,0xff ; /* AO_TS_SET */
mov ebx,addr ;
xchg byte ptr [ebx],al ;
}
/* Ignore possible "missing return value" warning here. */
}
/* Ignore possible "missing return value" warning here. */
# endif
}
/* Record that AO_test_and_set_full is defined here. */
# define AO_HAVE_test_and_set_full
#endif
Expand Down

0 comments on commit 51c78e1

Please sign in to comment.