#ifndef _URCU_ARCH_UATOMIC_X86_H
#define _URCU_ARCH_UATOMIC_X86_H

/*
 * Copyright (c) 1991-1994 by Xerox Corporation.  All rights reserved.
 * Copyright (c) 1996-1999 by Silicon Graphics.  All rights reserved.
 * Copyright (c) 1999-2004 Hewlett-Packard Development Company, L.P.
 * Copyright (c) 2009      Mathieu Desnoyers
 *
 * THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
 * OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
 *
 * Permission is hereby granted to use or copy this program
 * for any purpose,  provided the above notices are retained on all copies.
 * Permission to modify the code and to distribute modified code is granted,
 * provided the above notices are retained, and a notice that the code was
 * modified is included with the above copyright notice.
 *
 * Code inspired from libuatomic_ops-1.2, inherited in part from the
 * Boehm-Demers-Weiser conservative garbage collector.
 */

#include <urcu/compiler.h>
#include <urcu/system.h>

#define UATOMIC_HAS_ATOMIC_BYTE
#define UATOMIC_HAS_ATOMIC_SHORT
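/*
 * The two definitions above advertise that 1-byte and 2-byte atomic
 * operations are available: x86 provides lock-prefixed instructions for
 * byte and 16-bit operands.
 */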

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Derived from AO_compare_and_swap() and AO_test_and_set_full().
 */

struct __uatomic_dummy {
	unsigned long v[10];
};
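/*
 * __hp() casts an address to a pointer to the over-sized dummy structure
 * above, so that the void * arguments of the functions below can be used
 * directly as "m"/"+m" memory operands in the inline assembly, whatever
 * the real operand size.
 */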
#define __hp(x)	((struct __uatomic_dummy *)(x))

#define _uatomic_set(addr, v)	((void) CMM_STORE_SHARED(*(addr), (v)))
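/*
 * _uatomic_set() above is a plain CMM_STORE_SHARED(): naturally aligned
 * stores are atomic on x86, so no lock prefix is needed, but no memory
 * barrier is implied either.
 */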

/* cmpxchg */
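/*
 * __uatomic_cmpxchg() atomically compares *addr with old and, if they
 * are equal, stores _new into *addr.  It returns the value read from
 * *addr: equal to old on success, the current content of *addr on
 * failure.  "lock; cmpxchg" compares against the accumulator register
 * and leaves the previous memory value in it, hence the "+a" constraint
 * on result.
 */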

static inline __attribute__((always_inline))
unsigned long __uatomic_cmpxchg(void *addr, unsigned long old,
			      unsigned long _new, int len)
{
	switch (len) {
	case 1:
	{
		unsigned char result = old;

		__asm__ __volatile__(
		"lock; cmpxchgb %2, %1"
			: "+a"(result), "+m"(*__hp(addr))
			: "q"((unsigned char)_new)
			: "memory");
		return result;
	}
	case 2:
	{
		unsigned short result = old;

		__asm__ __volatile__(
		"lock; cmpxchgw %2, %1"
			: "+a"(result), "+m"(*__hp(addr))
			: "r"((unsigned short)_new)
			: "memory");
		return result;
	}
	case 4:
	{
		unsigned int result = old;

		__asm__ __volatile__(
		"lock; cmpxchgl %2, %1"
			: "+a"(result), "+m"(*__hp(addr))
			: "r"((unsigned int)_new)
			: "memory");
		return result;
	}
#if (CAA_BITS_PER_LONG == 64)
	case 8:
	{
		unsigned long result = old;

		__asm__ __volatile__(
		"lock; cmpxchgq %2, %1"
			: "+a"(result), "+m"(*__hp(addr))
			: "r"((unsigned long)_new)
			: "memory");
		return result;
	}
#endif
	}
	/*
	 * generate an illegal instruction. Cannot catch this with
	 * linker tricks when optimizations are disabled.
	 */
	__asm__ __volatile__("ud2");
	return 0;
}

#define _uatomic_cmpxchg(addr, old, _new)				      \
	((__typeof__(*(addr))) __uatomic_cmpxchg((addr),		      \
						caa_cast_long_keep_sign(old), \
						caa_cast_long_keep_sign(_new),\
						sizeof(*(addr))))
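
/*
 * Illustrative use only (not part of this header), assuming a
 * caller-provided "unsigned long counter": a compare-and-swap retry
 * loop.  uatomic_cmpxchg() returns the value read from the location,
 * so the update succeeded when that value matches the expected one.
 *
 *	unsigned long old, newv;
 *
 *	do {
 *		old = uatomic_read(&counter);
 *		newv = old + 1;
 *	} while (uatomic_cmpxchg(&counter, old, newv) != old);
 */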

/* xchg */
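/*
 * __uatomic_exchange() atomically stores val into *addr and returns the
 * previous value.  On x86 the implicitly-locked xchg also acts as a full
 * memory barrier.
 */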

static inline __attribute__((always_inline))
unsigned long __uatomic_exchange(void *addr, unsigned long val, int len)
{
	/*
	 * Note: the "xchg" instruction does not need a "lock" prefix,
	 * since it is implicitly locked when used with a memory operand.
	 */
	switch (len) {
	case 1:
	{
		unsigned char result;
		__asm__ __volatile__(
		"xchgb %0, %1"
			: "=q"(result), "+m"(*__hp(addr))
			: "0" ((unsigned char)val)
			: "memory");
		return result;
	}
	case 2:
	{
		unsigned short result;
		__asm__ __volatile__(
		"xchgw %0, %1"
			: "=r"(result), "+m"(*__hp(addr))
			: "0" ((unsigned short)val)
			: "memory");
		return result;
	}
	case 4:
	{
		unsigned int result;
		__asm__ __volatile__(
		"xchgl %0, %1"
			: "=r"(result), "+m"(*__hp(addr))
			: "0" ((unsigned int)val)
			: "memory");
		return result;
	}
#if (CAA_BITS_PER_LONG == 64)
	case 8:
	{
		unsigned long result;
		__asm__ __volatile__(
		"xchgq %0, %1"
			: "=r"(result), "+m"(*__hp(addr))
			: "0" ((unsigned long)val)
			: "memory");
		return result;
	}
#endif
	}
	/*
	 * generate an illegal instruction. Cannot catch this with
	 * linker tricks when optimizations are disabled.
	 */
	__asm__ __volatile__("ud2");
	return 0;
}

#define _uatomic_xchg(addr, v)						      \
	((__typeof__(*(addr))) __uatomic_exchange((addr),		      \
						caa_cast_long_keep_sign(v),   \
						sizeof(*(addr))))
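
/*
 * Illustrative use only (not part of this header), assuming a
 * caller-provided "int flag": a minimal test-and-set acquire built on
 * uatomic_xchg(), which returns the previous value.  On x86 a plain
 * uatomic_set(&flag, 0) is enough to release.
 *
 *	while (uatomic_xchg(&flag, 1) != 0)
 *		caa_cpu_relax();
 */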

/* uatomic_add_return */
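/*
 * __uatomic_add_return() atomically adds val to *addr and returns the
 * resulting value.  "lock; xadd" yields the previous content of the
 * memory location, so val is added back to the result before returning.
 */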

static inline __attribute__((always_inline))
unsigned long __uatomic_add_return(void *addr, unsigned long val,
				 int len)
{
	switch (len) {
	case 1:
	{
		unsigned char result = val;

		__asm__ __volatile__(
		"lock; xaddb %1, %0"
			: "+m"(*__hp(addr)), "+q" (result)
			:
			: "memory");
		return result + (unsigned char)val;
	}
	case 2:
	{
		unsigned short result = val;

		__asm__ __volatile__(
		"lock; xaddw %1, %0"
			: "+m"(*__hp(addr)), "+r" (result)
			:
			: "memory");
		return result + (unsigned short)val;
	}
	case 4:
	{
		unsigned int result = val;

		__asm__ __volatile__(
		"lock; xaddl %1, %0"
			: "+m"(*__hp(addr)), "+r" (result)
			:
			: "memory");
		return result + (unsigned int)val;
	}
#if (CAA_BITS_PER_LONG == 64)
	case 8:
	{
		unsigned long result = val;

		__asm__ __volatile__(
		"lock; xaddq %1, %0"
			: "+m"(*__hp(addr)), "+r" (result)
			:
			: "memory");
		return result + (unsigned long)val;
	}
#endif
	}
	/*
	 * generate an illegal instruction. Cannot catch this with
	 * linker tricks when optimizations are disabled.
	 */
	__asm__ __volatile__("ud2");
	return 0;
}

#define _uatomic_add_return(addr, v)					    \
	((__typeof__(*(addr))) __uatomic_add_return((addr),		    \
						caa_cast_long_keep_sign(v), \
						sizeof(*(addr))))
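
/*
 * Illustrative use only (not part of this header), assuming a
 * caller-provided reference count: uatomic_add_return() returns the
 * value after the addition, so dropping the last reference can be
 * detected atomically.
 *
 *	if (uatomic_add_return(&obj->refcount, -1) == 0)
 *		obj_free(obj);
 */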

/* uatomic_and */
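/*
 * __uatomic_and() atomically ANDs val into *addr; no value is returned.
 */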

static inline __attribute__((always_inline))
void __uatomic_and(void *addr, unsigned long val, int len)
{
	switch (len) {
	case 1:
	{
		__asm__ __volatile__(
		"lock; andb %1, %0"
			: "=m"(*__hp(addr))
			: "iq" ((unsigned char)val)
			: "memory");
		return;
	}
	case 2:
	{
		__asm__ __volatile__(
		"lock; andw %1, %0"
			: "=m"(*__hp(addr))
			: "ir" ((unsigned short)val)
			: "memory");
		return;
	}
	case 4:
	{
		__asm__ __volatile__(
		"lock; andl %1, %0"
			: "=m"(*__hp(addr))
			: "ir" ((unsigned int)val)
			: "memory");
		return;
	}
#if (CAA_BITS_PER_LONG == 64)
	case 8:
	{
		__asm__ __volatile__(
		"lock; andq %1, %0"
			: "=m"(*__hp(addr))
			: "er" ((unsigned long)val)
			: "memory");
		return;
	}
#endif
	}
	/*
	 * generate an illegal instruction. Cannot catch this with
	 * linker tricks when optimizations are disabled.
	 */
	__asm__ __volatile__("ud2");
	return;
}

#define _uatomic_and(addr, v)						   \
	(__uatomic_and((addr), caa_cast_long_keep_sign(v), sizeof(*(addr))))

/* uatomic_or */
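/*
 * __uatomic_or() atomically ORs val into *addr; no value is returned.
 */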

static inline __attribute__((always_inline))
void __uatomic_or(void *addr, unsigned long val, int len)
{
	switch (len) {
	case 1:
	{
		__asm__ __volatile__(
		"lock; orb %1, %0"
			: "=m"(*__hp(addr))
			: "iq" ((unsigned char)val)
			: "memory");
		return;
	}
	case 2:
	{
		__asm__ __volatile__(
		"lock; orw %1, %0"
			: "=m"(*__hp(addr))
			: "ir" ((unsigned short)val)
			: "memory");
		return;
	}
	case 4:
	{
		__asm__ __volatile__(
		"lock; orl %1, %0"
			: "=m"(*__hp(addr))
			: "ir" ((unsigned int)val)
			: "memory");
		return;
	}
#if (CAA_BITS_PER_LONG == 64)
	case 8:
	{
		__asm__ __volatile__(
		"lock; orq %1, %0"
			: "=m"(*__hp(addr))
			: "er" ((unsigned long)val)
			: "memory");
		return;
	}
#endif
	}
	/*
	 * generate an illegal instruction. Cannot catch this with
	 * linker tricks when optimizations are disabled.
	 */
	__asm__ __volatile__("ud2");
	return;
}

#define _uatomic_or(addr, v)						   \
	(__uatomic_or((addr), caa_cast_long_keep_sign(v), sizeof(*(addr))))

/* uatomic_add */
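/*
 * __uatomic_add() atomically adds val to *addr without returning the
 * result; use __uatomic_add_return() when the new value is needed.
 */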

static inline __attribute__((always_inline))
void __uatomic_add(void *addr, unsigned long val, int len)
{
	switch (len) {
	case 1:
	{
		__asm__ __volatile__(
		"lock; addb %1, %0"
			: "=m"(*__hp(addr))
			: "iq" ((unsigned char)val)
			: "memory");
		return;
	}
	case 2:
	{
		__asm__ __volatile__(
		"lock; addw %1, %0"
			: "=m"(*__hp(addr))
			: "ir" ((unsigned short)val)
			: "memory");
		return;
	}
	case 4:
	{
		__asm__ __volatile__(
		"lock; addl %1, %0"
			: "=m"(*__hp(addr))
			: "ir" ((unsigned int)val)
			: "memory");
		return;
	}
#if (CAA_BITS_PER_LONG == 64)
	case 8:
	{
		__asm__ __volatile__(
		"lock; addq %1, %0"
			: "=m"(*__hp(addr))
			: "er" ((unsigned long)val)
			: "memory");
		return;
	}
#endif
	}
	/*
	 * generate an illegal instruction. Cannot catch this with
	 * linker tricks when optimizations are disabled.
	 */
	__asm__ __volatile__("ud2");
	return;
}

#define _uatomic_add(addr, v)						   \
	(__uatomic_add((addr), caa_cast_long_keep_sign(v), sizeof(*(addr))))


/* uatomic_inc */
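/*
 * __uatomic_inc() atomically increments *addr by one.
 */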

static inline __attribute__((always_inline))
void __uatomic_inc(void *addr, int len)
{
	switch (len) {
	case 1:
	{
		__asm__ __volatile__(
		"lock; incb %0"
			: "=m"(*__hp(addr))
			:
			: "memory");
		return;
	}
	case 2:
	{
		__asm__ __volatile__(
		"lock; incw %0"
			: "=m"(*__hp(addr))
			:
			: "memory");
		return;
	}
	case 4:
	{
		__asm__ __volatile__(
		"lock; incl %0"
			: "=m"(*__hp(addr))
			:
			: "memory");
		return;
	}
#if (CAA_BITS_PER_LONG == 64)
	case 8:
	{
		__asm__ __volatile__(
		"lock; incq %0"
			: "=m"(*__hp(addr))
			:
			: "memory");
		return;
	}
#endif
	}
	/*
	 * generate an illegal instruction. Cannot catch this with
	 * linker tricks when optimizations are disabled.
	 */
	__asm__ __volatile__("ud2");
	return;
}

#define _uatomic_inc(addr)	(__uatomic_inc((addr), sizeof(*(addr))))

/* uatomic_dec */
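/*
 * __uatomic_dec() atomically decrements *addr by one.
 */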

static inline __attribute__((always_inline))
void __uatomic_dec(void *addr, int len)
{
	switch (len) {
	case 1:
	{
		__asm__ __volatile__(
		"lock; decb %0"
			: "=m"(*__hp(addr))
			:
			: "memory");
		return;
	}
	case 2:
	{
		__asm__ __volatile__(
		"lock; decw %0"
			: "=m"(*__hp(addr))
			:
			: "memory");
		return;
	}
	case 4:
	{
		__asm__ __volatile__(
		"lock; decl %0"
			: "=m"(*__hp(addr))
			:
			: "memory");
		return;
	}
#if (CAA_BITS_PER_LONG == 64)
	case 8:
	{
		__asm__ __volatile__(
		"lock; decq %0"
			: "=m"(*__hp(addr))
			:
			: "memory");
		return;
	}
#endif
	}
	/*
	 * generate an illegal instruction. Cannot catch this with
	 * linker tricks when optimizations are disabled.
	 */
	__asm__ __volatile__("ud2");
	return;
}

#define _uatomic_dec(addr)	(__uatomic_dec((addr), sizeof(*(addr))))

#if ((CAA_BITS_PER_LONG != 64) && defined(CONFIG_RCU_COMPAT_ARCH))
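/*
 * Compat support for old 32-bit x86 CPUs lacking the cmpxchg instruction
 * (CONFIG_RCU_COMPAT_ARCH).  As used below, __rcu_cas_avail is positive
 * once cmpxchg is known to be available, zero when it is not, and
 * negative until __rcu_cas_init() has probed the CPU.  UATOMIC_COMPAT(insn)
 * dispatches either to the native _uatomic_*() implementation above or to
 * the compat_uatomic_*() fallbacks.
 */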
extern int __rcu_cas_avail;
extern int __rcu_cas_init(void);

#define UATOMIC_COMPAT(insn)							\
	((caa_likely(__rcu_cas_avail > 0))						\
	? (_uatomic_##insn)							\
		: ((caa_unlikely(__rcu_cas_avail < 0)				\
			? ((__rcu_cas_init() > 0)				\
				? (_uatomic_##insn)				\
				: (compat_uatomic_##insn))			\
			: (compat_uatomic_##insn))))

/*
 * _compat_uatomic_set() keeps its return value so that the library ABI
 * stays unchanged, but the wrapper macro below casts it away, removing
 * the return value from the API.
 */
extern unsigned long _compat_uatomic_set(void *addr,
					 unsigned long _new, int len);
#define compat_uatomic_set(addr, _new)				     	       \
	((void) _compat_uatomic_set((addr),				       \
				caa_cast_long_keep_sign(_new),		       \
				sizeof(*(addr))))


extern unsigned long _compat_uatomic_xchg(void *addr,
					  unsigned long _new, int len);
#define compat_uatomic_xchg(addr, _new)					       \
	((__typeof__(*(addr))) _compat_uatomic_xchg((addr),		       \
						caa_cast_long_keep_sign(_new), \
						sizeof(*(addr))))

extern unsigned long _compat_uatomic_cmpxchg(void *addr, unsigned long old,
					     unsigned long _new, int len);
#define compat_uatomic_cmpxchg(addr, old, _new)				       \
	((__typeof__(*(addr))) _compat_uatomic_cmpxchg((addr),		       \
						caa_cast_long_keep_sign(old),  \
						caa_cast_long_keep_sign(_new), \
						sizeof(*(addr))))

extern void _compat_uatomic_and(void *addr, unsigned long _new, int len);
#define compat_uatomic_and(addr, v)				       \
	(_compat_uatomic_and((addr),				       \
			caa_cast_long_keep_sign(v),		       \
			sizeof(*(addr))))

extern void _compat_uatomic_or(void *addr, unsigned long _new, int len);
#define compat_uatomic_or(addr, v)				       \
	(_compat_uatomic_or((addr),				       \
			  caa_cast_long_keep_sign(v),		       \
			  sizeof(*(addr))))

extern unsigned long _compat_uatomic_add_return(void *addr,
						unsigned long _new, int len);
#define compat_uatomic_add_return(addr, v)			            \
	((__typeof__(*(addr))) _compat_uatomic_add_return((addr),     	    \
						caa_cast_long_keep_sign(v), \
						sizeof(*(addr))))

#define compat_uatomic_add(addr, v)					       \
		((void)compat_uatomic_add_return((addr), (v)))
#define compat_uatomic_inc(addr)					       \
		(compat_uatomic_add((addr), 1))
#define compat_uatomic_dec(addr)					       \
		(compat_uatomic_add((addr), -1))

#else
#define UATOMIC_COMPAT(insn)	(_uatomic_##insn)
#endif

/*
 * Reads are atomic even in compat mode, hence there is no compat
 * wrapper for uatomic_read().
 */
#define uatomic_set(addr, v)			\
		UATOMIC_COMPAT(set(addr, v))

#define uatomic_cmpxchg(addr, old, _new)	\
		UATOMIC_COMPAT(cmpxchg(addr, old, _new))
#define uatomic_xchg(addr, v)			\
		UATOMIC_COMPAT(xchg(addr, v))
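
/*
 * The cmm_smp_mb__before_*() and cmm_smp_mb__after_*() macros below
 * expand to compiler barriers only: on x86, the lock-prefixed
 * instructions used above already act as full memory barriers, so no
 * fence instruction is needed around these atomic operations.
 */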

#define uatomic_and(addr, v)		\
		UATOMIC_COMPAT(and(addr, v))
#define cmm_smp_mb__before_uatomic_and()	cmm_barrier()
#define cmm_smp_mb__after_uatomic_and()		cmm_barrier()

#define uatomic_or(addr, v)		\
		UATOMIC_COMPAT(or(addr, v))
#define cmm_smp_mb__before_uatomic_or()		cmm_barrier()
#define cmm_smp_mb__after_uatomic_or()		cmm_barrier()

#define uatomic_add_return(addr, v)		\
		UATOMIC_COMPAT(add_return(addr, v))

#define uatomic_add(addr, v)	UATOMIC_COMPAT(add(addr, v))
#define cmm_smp_mb__before_uatomic_add()	cmm_barrier()
#define cmm_smp_mb__after_uatomic_add()		cmm_barrier()

#define uatomic_inc(addr)	UATOMIC_COMPAT(inc(addr))
#define cmm_smp_mb__before_uatomic_inc()	cmm_barrier()
#define cmm_smp_mb__after_uatomic_inc()		cmm_barrier()

#define uatomic_dec(addr)	UATOMIC_COMPAT(dec(addr))
#define cmm_smp_mb__before_uatomic_dec()	cmm_barrier()
#define cmm_smp_mb__after_uatomic_dec()		cmm_barrier()

#ifdef __cplusplus
}
#endif

#include <urcu/uatomic/generic.h>

#endif /* _URCU_ARCH_UATOMIC_X86_H */