/*
* Copyright (c) 1991-1994 by Xerox Corporation.  All rights reserved.
* Copyright (c) 1996-1999 by Silicon Graphics.  All rights reserved.
* Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved.
*
*
* THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
* OR IMPLIED.  ANY USE IS AT YOUR OWN RISK.
*
* Permission is hereby granted to use or copy this program
* for any purpose, provided the above notices are retained on all copies.
* Permission to modify the code and to distribute modified code is granted,
* provided the above notices are retained, and a notice that the code was
* modified is included with the above copyright notice.
*
*/

#if (AO_GNUC_PREREQ(5, 4) || AO_CLANG_PREREQ(8, 0)) && defined(__s390x__) \
   && !defined(AO_DISABLE_GCC_ATOMICS)
 /* Probably, it could be enabled for earlier clang/gcc versions.      */
 /* But, e.g., clang-3.8.0 produces a backend error for AtomicFence.   */

# include "generic.h"

#else /* AO_DISABLE_GCC_ATOMICS, not s390x, or compiler too old */

/* The relevant documentation appears to be at                  */
/* http://publibz.boulder.ibm.com/epubs/pdf/dz9zr003.pdf        */
/* around page 5-96.  Apparently:                               */
/* - Memory references in general are atomic only for a single  */
/*   byte.  But it appears that the most common load/store      */
/*   instructions also guarantee atomicity for aligned          */
/*   operands of standard types.  WE FOOLISHLY ASSUME that      */
/*   compilers only generate those.  If that turns out to be    */
/*   wrong, we need inline assembly code for AO_load and        */
/*   AO_store.                                                  */
/* - A store followed by a load is unordered since the store    */
/*   may be delayed.  Otherwise everything is ordered.          */
/* - There is a hardware compare-and-swap (CS) instruction.     */

#include "../all_aligned_atomic_load_store.h"

#include "../ordered_except_wr.h"

#include "../test_and_set_t_is_ao_t.h"
/* TODO: Is there a way to do byte-sized test-and-set? */

/* TODO: AO_nop_full should probably be implemented directly.   */
/* It appears that certain BCR instructions have that effect.   */
/* Presumably they're cheaper than CS?                          */

#ifndef AO_GENERALIZE_ASM_BOOL_CAS
/* Boolean compare-and-swap on the word at addr, built on the hardware  */
/* compare-and-swap instruction: CS in 31-bit mode, CSG (the 64-bit     */
/* form) when __s390x__ is defined.                                     */
/* The instruction compares old with *addr and, if they are equal,      */
/* stores new_val there.  Returns nonzero iff the resulting condition   */
/* code is 0, which (per the z/Architecture Principles of Operation)    */
/* means the store took place.                                          */
/* The "_full" suffix presumably denotes full memory-barrier semantics, */
/* as elsewhere in this library; confirm against the AO documentation.  */
AO_INLINE int AO_compare_and_swap_full(volatile AO_t *addr,
                                      AO_t old, AO_t new_val)
{
 int retval;
 __asm__ __volatile__ (
# ifndef __s390x__
   "     cs  %1,%2,0(%3)\n"
# else
   "     csg %1,%2,0(%3)\n"
# endif
 /* Copy the condition code (and program mask) into retval, then      */
 /* shift right by 28 so that only the condition code is left.        */
 "     ipm %0\n"
 "     srl %0,28\n"
 /* Outputs: retval is early-clobbered ("&") so it never shares a     */
 /* register with an input; old is read-write because the instruction */
 /* may overwrite its first operand (that value is discarded here).   */
 : "=&d" (retval), "+d" (old)
 /* Inputs: new_val in a data register; addr in an address register,  */
 /* used as the base of the 0(%3) memory operand.                     */
 : "d" (new_val), "a" (addr)
 /* The condition code is modified.  *addr is not listed as an        */
 /* operand, so the "memory" clobber is what tells the compiler that  */
 /* memory is read and written here.                                  */
 : "cc", "memory");
 return retval == 0;
}
#define AO_HAVE_compare_and_swap_full
#endif /* !AO_GENERALIZE_ASM_BOOL_CAS */

/* Value-returning compare-and-swap on the word at addr, using CS in    */
/* 31-bit mode or CSG (the 64-bit form) when __s390x__ is defined.      */
/* The instruction compares old with *addr and, if they are equal,      */
/* stores new_val there.  The function returns the content of old after */
/* the instruction; per the z/Architecture Principles of Operation this */
/* is the value that was found at *addr (old is left unchanged on       */
/* success and is loaded from memory on failure), so the caller can     */
/* detect success by comparing the result with the old it passed in.    */
/* The "_full" suffix presumably denotes full memory-barrier semantics, */
/* as elsewhere in this library; confirm against the AO documentation.  */
AO_INLINE AO_t
AO_fetch_compare_and_swap_full(volatile AO_t *addr,
                              AO_t old, AO_t new_val)
{
 __asm__ __volatile__ (
#   ifndef __s390x__
     "     cs %0,%2,%1\n"
#   else
     "     csg %0,%2,%1\n"
#   endif
   /* Outputs: old is read-write (compared, and possibly replaced by   */
   /* the instruction); *addr is a memory output ("Q" is the S/390     */
   /* constraint for a memory reference without an index register and  */
   /* with a short displacement, which is the form CS/CSG accept).     */
   : "+d" (old), "=Q" (*addr)
   /* Inputs: new_val in a data register; *addr is listed again as an  */
   /* input because the instruction also reads it.                     */
   : "d" (new_val), "m" (*addr)
   /* The condition code is modified; the "memory" clobber stops the   */
   /* compiler from caching or reordering memory accesses around the   */
   /* asm statement.                                                   */
   : "cc", "memory");
 return old;
}
#define AO_HAVE_fetch_compare_and_swap_full

#endif /* AO_DISABLE_GCC_ATOMICS */

/* TODO: Add double-wide operations for 32-bit executables.       */