WIP: integrated AO but not quite right yet.

Gregory Burd 2013-07-08 13:34:42 -04:00
parent 565f95f9b5
commit aedb91aca5
22 changed files with 5614 additions and 198 deletions

c_src/atomic.h Normal file

@@ -0,0 +1,100 @@
/*
* File:
* atomic.h
* Author(s):
* Pascal Felber <pascal.felber@unine.ch>
* Patrick Marlier <patrick.marlier@unine.ch>
* Description:
* Atomic operations.
*
* Copyright (c) 2007-2012.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation, version 2
* of the License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* This program has a dual license and can also be distributed
* under the terms of the MIT license.
*/
#ifndef _ATOMIC_H_
# define _ATOMIC_H_
# ifdef ATOMIC_BUILTIN
typedef volatile size_t atomic_t;
# ifdef __INTEL_COMPILER
# define ATOMIC_CB __memory_barrier()
# else /* ! __INTEL_COMPILER, assuming __GNUC__ */
# define ATOMIC_CB __asm__ __volatile__("": : :"memory")
# endif /* ! __INTEL_COMPILER */
# ifndef UNSAFE
# warning "This is experimental and shouldn't be used"
/*
Note: the __sync_* builtins are available for GCC 4.2+ and ICC 11.1+,
but these definitions are not 100% safe:
* 'a' needs to be volatile
* no read/store fences are provided (only a full fence)
C11 and C++11 also provide atomic operations.
*/
# define ATOMIC_CAS_FULL(a, e, v) (__sync_bool_compare_and_swap(a, e, v))
# define ATOMIC_FETCH_INC_FULL(a) (__sync_fetch_and_add(a, 1))
# define ATOMIC_FETCH_DEC_FULL(a) (__sync_fetch_and_add(a, -1))
# define ATOMIC_FETCH_ADD_FULL(a, v) (__sync_fetch_and_add(a, v))
# define ATOMIC_LOAD_ACQ(a) (*(a))
# define ATOMIC_LOAD(a) (*(a))
# define ATOMIC_STORE_REL(a, v) (*(a) = (v))
# define ATOMIC_STORE(a, v) (*(a) = (v))
# define ATOMIC_MB_READ /* Nothing */
# define ATOMIC_MB_WRITE /* Nothing */
# define ATOMIC_MB_FULL __sync_synchronize()
# else
/* Use only for testing purposes (single thread benchmarks) */
# define ATOMIC_CAS_FULL(a, e, v) (*(a) = (v), 1)
# define ATOMIC_FETCH_INC_FULL(a) ((*(a))++)
# define ATOMIC_FETCH_DEC_FULL(a) ((*(a))--)
# define ATOMIC_FETCH_ADD_FULL(a, v) ((*(a)) += (v))
# define ATOMIC_LOAD_ACQ(a) (*(a))
# define ATOMIC_LOAD(a) (*(a))
# define ATOMIC_STORE_REL(a, v) (*(a) = (v))
# define ATOMIC_STORE(a, v) (*(a) = (v))
# define ATOMIC_MB_READ /* Nothing */
# define ATOMIC_MB_WRITE /* Nothing */
# define ATOMIC_MB_FULL /* Nothing */
# endif /* UNSAFE */
# else /* ! ATOMIC_BUILTIN */
/* NOTE: enables fence instructions on i386 and amd64, but the mfence instruction seems costly. */
/* # define AO_USE_PENTIUM4_INSTRS */
# include "atomic_ops/atomic_ops.h"
typedef AO_t atomic_t;
# define ATOMIC_CB AO_compiler_barrier()
# define ATOMIC_CAS_FULL(a, e, v) (AO_compare_and_swap_full((volatile AO_t *)(a), (AO_t)(e), (AO_t)(v)))
# define ATOMIC_FETCH_INC_FULL(a) (AO_fetch_and_add1_full((volatile AO_t *)(a)))
# define ATOMIC_FETCH_DEC_FULL(a) (AO_fetch_and_sub1_full((volatile AO_t *)(a)))
# define ATOMIC_FETCH_ADD_FULL(a, v) (AO_fetch_and_add_full((volatile AO_t *)(a), (AO_t)(v)))
# ifdef SAFE
# define ATOMIC_LOAD_ACQ(a) (AO_load_full((volatile AO_t *)(a)))
# define ATOMIC_LOAD(a) (AO_load_full((volatile AO_t *)(a)))
# define ATOMIC_STORE_REL(a, v) (AO_store_full((volatile AO_t *)(a), (AO_t)(v)))
# define ATOMIC_STORE(a, v) (AO_store_full((volatile AO_t *)(a), (AO_t)(v)))
# define ATOMIC_MB_READ AO_nop_full()
# define ATOMIC_MB_WRITE AO_nop_full()
# define ATOMIC_MB_FULL AO_nop_full()
# else /* ! SAFE */
# define ATOMIC_LOAD_ACQ(a) (AO_load_acquire_read((volatile AO_t *)(a)))
# define ATOMIC_LOAD(a) (*((volatile AO_t *)(a)))
# define ATOMIC_STORE_REL(a, v) (AO_store_release((volatile AO_t *)(a), (AO_t)(v)))
# define ATOMIC_STORE(a, v) (*((volatile AO_t *)(a)) = (AO_t)(v))
# define ATOMIC_MB_READ AO_nop_read()
# define ATOMIC_MB_WRITE AO_nop_write()
# define ATOMIC_MB_FULL AO_nop_full()
# endif /* ! SAFE */
# endif /* ! ATOMIC_BUILTIN */
#endif /* _ATOMIC_H_ */
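/* Illustration only, not part of the commit: the note above points at C11
   atomics as an alternative. Assuming a C11 compiler, the same operations
   map onto <stdatomic.h> roughly as follows (all c11_* names are invented
   for this sketch): */
#include <stdatomic.h>
#include <stddef.h>

typedef atomic_size_t c11_atomic_t;

/* seq_cst compare-and-swap, analogous to ATOMIC_CAS_FULL */
static inline int c11_cas_full(c11_atomic_t *a, size_t e, size_t v)
{
    return atomic_compare_exchange_strong(a, &e, v);
}

/* seq_cst fetch-and-add, analogous to ATOMIC_FETCH_ADD_FULL */
static inline size_t c11_fetch_add_full(c11_atomic_t *a, size_t v)
{
    return atomic_fetch_add(a, v);
}

/* acquire load / release store, analogous to ATOMIC_LOAD_ACQ and
   ATOMIC_STORE_REL */
static inline size_t c11_load_acq(const c11_atomic_t *a)
{
    return atomic_load_explicit(a, memory_order_acquire);
}

static inline void c11_store_rel(c11_atomic_t *a, size_t v)
{
    atomic_store_explicit(a, v, memory_order_release);
}

/* full fence, analogous to ATOMIC_MB_FULL */
#define c11_mb_full() atomic_thread_fence(memory_order_seq_cst)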

c_src/atomic_ops/AUTHORS Normal file

@@ -0,0 +1,4 @@
Originally written by Hans Boehm, with some platform-dependent code
imported from the Boehm-Demers-Weiser GC, where it was contributed
by many others.

c_src/atomic_ops/COPYING Normal file

@@ -0,0 +1,340 @@
GNU GENERAL PUBLIC LICENSE
Version 2, June 1991
Copyright (C) 1989, 1991 Free Software Foundation, Inc.
59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
software--to make sure the software is free for all its users. This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it. (Some other Free Software Foundation software is covered by
the GNU Library General Public License instead.) You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
this service if you wish), that you receive source code or can get it
if you want it, that you can change the software or use pieces of it
in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have. You must make sure that they, too, receive or can get the
source code. And you must show them these terms so they know their
rights.
We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.
Also, for each author's protection and ours, we want to make certain
that everyone understands that there is no warranty for this free
software. If the software is modified by someone else and passed on, we
want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.
Finally, any free program is threatened constantly by software
patents. We wish to avoid the danger that redistributors of a free
program will individually obtain patent licenses, in effect making the
program proprietary. To prevent this, we have made it clear that any
patent must be licensed for everyone's free use or not licensed at all.
The precise terms and conditions for copying, distribution and
modification follow.
GNU GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License applies to any program or other work which contains
a notice placed by the copyright holder saying it may be distributed
under the terms of this General Public License. The "Program", below,
refers to any such program or work, and a "work based on the Program"
means either the Program or any derivative work under copyright law:
that is to say, a work containing the Program or a portion of it,
either verbatim or with modifications and/or translated into another
language. (Hereinafter, translation is included without limitation in
the term "modification".) Each licensee is addressed as "you".
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running the Program is not restricted, and the output from the Program
is covered only if its contents constitute a work based on the
Program (independent of having been made by running the Program).
Whether that is true depends on what the Program does.
1. You may copy and distribute verbatim copies of the Program's
source code as you receive it, in any medium, provided that you
conspicuously and appropriately publish on each copy an appropriate
copyright notice and disclaimer of warranty; keep intact all the
notices that refer to this License and to the absence of any warranty;
and give any other recipients of the Program a copy of this License
along with the Program.
You may charge a fee for the physical act of transferring a copy, and
you may at your option offer warranty protection in exchange for a fee.
2. You may modify your copy or copies of the Program or any portion
of it, thus forming a work based on the Program, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) You must cause the modified files to carry prominent notices
stating that you changed the files and the date of any change.
b) You must cause any work that you distribute or publish, that in
whole or in part contains or is derived from the Program or any
part thereof, to be licensed as a whole at no charge to all third
parties under the terms of this License.
c) If the modified program normally reads commands interactively
when run, you must cause it, when started running for such
interactive use in the most ordinary way, to print or display an
announcement including an appropriate copyright notice and a
notice that there is no warranty (or else, saying that you provide
a warranty) and that users may redistribute the program under
these conditions, and telling the user how to view a copy of this
License. (Exception: if the Program itself is interactive but
does not normally print such an announcement, your work based on
the Program is not required to print an announcement.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Program, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.
In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may copy and distribute the Program (or a work based on it,
under Section 2) in object code or executable form under the terms of
Sections 1 and 2 above provided that you also do one of the following:
a) Accompany it with the complete corresponding machine-readable
source code, which must be distributed under the terms of Sections
1 and 2 above on a medium customarily used for software interchange; or,
b) Accompany it with a written offer, valid for at least three
years, to give any third party, for a charge no more than your
cost of physically performing source distribution, a complete
machine-readable copy of the corresponding source code, to be
distributed under the terms of Sections 1 and 2 above on a medium
customarily used for software interchange; or,
c) Accompany it with the information you received as to the offer
to distribute corresponding source code. (This alternative is
allowed only for noncommercial distribution and only if you
received the program in object code or executable form with such
an offer, in accord with Subsection b above.)
The source code for a work means the preferred form of the work for
making modifications to it. For an executable work, complete source
code means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to
control compilation and installation of the executable. However, as a
special exception, the source code distributed need not include
anything that is normally distributed (in either source or binary
form) with the major components (compiler, kernel, and so on) of the
operating system on which the executable runs, unless that component
itself accompanies the executable.
If distribution of executable or object code is made by offering
access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.
4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License. Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
void, and will automatically terminate your rights under this License.
However, parties who have received copies, or rights, from you under
this License will not have their licenses terminated so long as such
parties remain in full compliance.
5. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Program or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Program (or any work based on the
Program), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Program or works based on it.
6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties to
this License.
7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Program at all. For example, if a patent
license would not permit royalty-free redistribution of the Program by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Program.
If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply and the section as a whole is intended to apply in other
circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
may add an explicit geographical distribution limitation excluding
those countries, so that distribution is permitted only in or among
countries not thus excluded. In such case, this License incorporates
the limitation as if written in the body of this License.
9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation. If the Program does not specify a version number of
this License, you may choose any version ever published by the Free Software
Foundation.
10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission. For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
make exceptions for this. Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
NO WARRANTY
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Also add information on how to contact you by electronic and paper mail.
If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:
Gnomovision version 69, Copyright (C) year name of author
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, the commands you use may
be called something other than `show w' and `show c'; they could even be
mouse-clicks or menu items--whatever suits your program.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the program, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
`Gnomovision' (which makes passes at compilers) written by James Hacker.
<signature of Ty Coon>, 1 April 1989
Ty Coon, President of Vice
This General Public License does not permit incorporating your program into
proprietary programs. If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
library. If this is what you want to do, use the GNU Library General
Public License instead of this License.

c_src/atomic_ops/README Normal file

@@ -0,0 +1,2 @@
This directory contains a stripped-down (gcc-only) version of libatomic_ops by Hans Boehm.
The official release is available from http://www.hpl.hp.com/research/linux/atomic_ops/.

c_src/atomic_ops/aligned_atomic_load_store.h Normal file

@@ -0,0 +1,46 @@
/*
* Copyright (c) 2003 Hewlett-Packard Development Company, L.P.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*
* Definitions for architectures on which loads and stores of AO_t are
* atomic for all legal alignments.
*/
AO_INLINE AO_t
AO_load(const volatile AO_t *addr)
{
assert(((size_t)addr & (sizeof(AO_t) - 1)) == 0);
/* Cast away the volatile for architectures where */
/* volatile adds barrier semantics. */
return *(AO_t *)addr;
}
#define AO_HAVE_load
AO_INLINE void
AO_store(volatile AO_t *addr, AO_t new_val)
{
assert(((size_t)addr & (sizeof(AO_t) - 1)) == 0);
(*(AO_t *)addr) = new_val;
}
#define AO_HAVE_store

c_src/atomic_ops/all_acquire_release_volatile.h Normal file

@@ -0,0 +1,168 @@
/*
* Copyright (c) 2004 Hewlett-Packard Development Company, L.P.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*
* Describes architectures on which volatile AO_t, unsigned char, unsigned
* short, and unsigned int loads and stores have acquire/release semantics for
* all normally legal alignments.
*/
//#include "acquire_release_volatile.h"
//#include "char_acquire_release_volatile.h"
//#include "short_acquire_release_volatile.h"
//#include "int_acquire_release_volatile.h"
/*
* This file adds definitions appropriate for environments in which an AO_t
* volatile load has acquire semantics, and an AO_t volatile store has release
* semantics. This is arguably supposed to be true with the standard Itanium
* software conventions.
*/
/*
* Empirically gcc/ia64 does some reordering of ordinary operations around volatiles
* even when we think it shouldn't. Gcc 3.3 and earlier could reorder a volatile store
* with another store. As of March 2005, gcc pre-4 reused previously computed
* common subexpressions across a volatile load.
* Hence we now add compiler barriers for gcc.
*/
#if !defined(AO_GCC_BARRIER)
# if defined(__GNUC__)
# define AO_GCC_BARRIER() AO_compiler_barrier()
# else
# define AO_GCC_BARRIER()
# endif
#endif
AO_INLINE AO_t
AO_load_acquire(const volatile AO_t *p)
{
AO_t result = *p;
/* A normal volatile load generates an ld.acq */
AO_GCC_BARRIER();
return result;
}
#define AO_HAVE_load_acquire
AO_INLINE void
AO_store_release(volatile AO_t *p, AO_t val)
{
AO_GCC_BARRIER();
/* A normal volatile store generates an st.rel */
*p = val;
}
#define AO_HAVE_store_release
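/*
 * Illustration (not part of the original file): the acquire/release pair
 * above supports the classic message-passing idiom. Assuming a shared
 * payload and a ready flag (both names invented for the sketch):
 *
 *   volatile AO_t ready = 0;  AO_t data;        (shared)
 *
 *   producer:                      consumer:
 *     data = compute();              while (!AO_load_acquire(&ready))
 *     AO_store_release(&ready, 1);       ;      spin
 *                                    use(data); guaranteed initialized
 *
 * The release store keeps the write to data from sinking below the flag
 * store; the acquire load keeps the read of data from hoisting above the
 * flag load.
 */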
/*
* This file adds definitions appropriate for environments in which an unsigned char
* volatile load has acquire semantics, and an unsigned char volatile store has release
* semantics. This is true with the standard Itanium ABI.
*/
#if !defined(AO_GCC_BARRIER)
# if defined(__GNUC__)
# define AO_GCC_BARRIER() AO_compiler_barrier()
# else
# define AO_GCC_BARRIER()
# endif
#endif
AO_INLINE unsigned char
AO_char_load_acquire(const volatile unsigned char *p)
{
unsigned char result = *p;
/* A normal volatile load generates an ld.acq */
AO_GCC_BARRIER();
return result;
}
#define AO_HAVE_char_load_acquire
AO_INLINE void
AO_char_store_release(volatile unsigned char *p, unsigned char val)
{
AO_GCC_BARRIER();
/* A normal volatile store generates an st.rel */
*p = val;
}
#define AO_HAVE_char_store_release
/*
* This file adds definitions appropriate for environments in which an unsigned short
* volatile load has acquire semantics, and an unsigned short volatile store has release
* semantics. This is true with the standard Itanium ABI.
*/
#if !defined(AO_GCC_BARRIER)
# if defined(__GNUC__)
# define AO_GCC_BARRIER() AO_compiler_barrier()
# else
# define AO_GCC_BARRIER()
# endif
#endif
AO_INLINE unsigned short
AO_short_load_acquire(const volatile unsigned short *p)
{
unsigned short result = *p;
/* A normal volatile load generates an ld.acq */
AO_GCC_BARRIER();
return result;
}
#define AO_HAVE_short_load_acquire
AO_INLINE void
AO_short_store_release(volatile unsigned short *p, unsigned short val)
{
AO_GCC_BARRIER();
/* A normal volatile store generates an st.rel */
*p = val;
}
#define AO_HAVE_short_store_release
/*
* This file adds definitions appropriate for environments in which an unsigned
* int volatile load has acquire semantics, and an unsigned int volatile
* store has release semantics. This is true with the standard Itanium ABI.
*/
#if !defined(AO_GCC_BARRIER)
# if defined(__GNUC__)
# define AO_GCC_BARRIER() AO_compiler_barrier()
# else
# define AO_GCC_BARRIER()
# endif
#endif
AO_INLINE unsigned int
AO_int_load_acquire(const volatile unsigned int *p)
{
unsigned int result = *p;
/* A normal volatile load generates an ld.acq */
AO_GCC_BARRIER();
return result;
}
#define AO_HAVE_int_load_acquire
AO_INLINE void
AO_int_store_release(volatile unsigned int *p, unsigned int val)
{
AO_GCC_BARRIER();
/* A normal volatile store generates an st.rel */
*p = val;
}
#define AO_HAVE_int_store_release

c_src/atomic_ops/ao_t_is_int.h Normal file

@@ -0,0 +1,126 @@
/*
* Copyright (c) 2003-2004 Hewlett-Packard Development Company, L.P.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*
* Inclusion of this file signifies that AO_t is in fact int. Hence
* any AO_... operations can also serve as AO_int_... operations.
* We currently define only the more important ones here, and allow for
* the normal generalization process to define the others.
* We should probably add others in the future.
*/
#if defined(AO_HAVE_compare_and_swap_full) && \
!defined(AO_HAVE_int_compare_and_swap_full)
# define AO_int_compare_and_swap_full(addr, old, new_val) \
AO_compare_and_swap_full((volatile AO_t *)(addr), \
(AO_t)(old), (AO_t)(new_val))
# define AO_HAVE_int_compare_and_swap_full
# endif
#if defined(AO_HAVE_compare_and_swap_acquire) && \
!defined(AO_HAVE_int_compare_and_swap_acquire)
# define AO_int_compare_and_swap_acquire(addr, old, new_val) \
AO_compare_and_swap_acquire((volatile AO_t *)(addr), \
(AO_t)(old), (AO_t)(new_val))
# define AO_HAVE_int_compare_and_swap_acquire
# endif
#if defined(AO_HAVE_compare_and_swap_release) && \
!defined(AO_HAVE_int_compare_and_swap_release)
# define AO_int_compare_and_swap_release(addr, old, new_val) \
AO_compare_and_swap_release((volatile AO_t *)(addr), \
(AO_t)(old), (AO_t)(new_val))
# define AO_HAVE_int_compare_and_swap_release
# endif
#if defined(AO_HAVE_compare_and_swap_write) && \
!defined(AO_HAVE_int_compare_and_swap_write)
# define AO_int_compare_and_swap_write(addr, old, new_val) \
AO_compare_and_swap_write((volatile AO_t *)(addr), \
(AO_t)(old), (AO_t)(new_val))
# define AO_HAVE_int_compare_and_swap_write
# endif
#if defined(AO_HAVE_compare_and_swap_read) && \
!defined(AO_HAVE_int_compare_and_swap_read)
# define AO_int_compare_and_swap_read(addr, old, new_val) \
AO_compare_and_swap_read((volatile AO_t *)(addr), \
(AO_t)(old), (AO_t)(new_val))
# define AO_HAVE_int_compare_and_swap_read
# endif
#if defined(AO_HAVE_compare_and_swap) && \
!defined(AO_HAVE_int_compare_and_swap)
# define AO_int_compare_and_swap(addr, old, new_val) \
AO_compare_and_swap((volatile AO_t *)(addr), \
(AO_t)(old), (AO_t)(new_val))
# define AO_HAVE_int_compare_and_swap
# endif
#if defined(AO_HAVE_load_acquire) && \
!defined(AO_HAVE_int_load_acquire)
# define AO_int_load_acquire(addr) \
(int)AO_load_acquire((const volatile AO_t *)(addr))
# define AO_HAVE_int_load_acquire
# endif
#if defined(AO_HAVE_store_release) && \
!defined(AO_HAVE_int_store_release)
# define AO_int_store_release(addr, val) \
AO_store_release((volatile AO_t *)(addr), (AO_t)(val))
# define AO_HAVE_int_store_release
# endif
#if defined(AO_HAVE_fetch_and_add_full) && \
!defined(AO_HAVE_int_fetch_and_add_full)
# define AO_int_fetch_and_add_full(addr, incr) \
(int)AO_fetch_and_add_full((volatile AO_t *)(addr), (AO_t)(incr))
# define AO_HAVE_int_fetch_and_add_full
# endif
#if defined(AO_HAVE_fetch_and_add1_acquire) && \
!defined(AO_HAVE_int_fetch_and_add1_acquire)
# define AO_int_fetch_and_add1_acquire(addr) \
(int)AO_fetch_and_add1_acquire((volatile AO_t *)(addr))
# define AO_HAVE_int_fetch_and_add1_acquire
# endif
#if defined(AO_HAVE_fetch_and_add1_release) && \
!defined(AO_HAVE_int_fetch_and_add1_release)
# define AO_int_fetch_and_add1_release(addr) \
(int)AO_fetch_and_add1_release((volatile AO_t *)(addr))
# define AO_HAVE_int_fetch_and_add1_release
# endif
#if defined(AO_HAVE_fetch_and_sub1_acquire) && \
!defined(AO_HAVE_int_fetch_and_sub1_acquire)
# define AO_int_fetch_and_sub1_acquire(addr) \
(int)AO_fetch_and_sub1_acquire((volatile AO_t *)(addr))
# define AO_HAVE_int_fetch_and_sub1_acquire
# endif
#if defined(AO_HAVE_fetch_and_sub1_release) && \
!defined(AO_HAVE_int_fetch_and_sub1_release)
# define AO_int_fetch_and_sub1_release(addr) \
(int)AO_fetch_and_sub1_release((volatile AO_t *)(addr))
# define AO_HAVE_int_fetch_and_sub1_release
# endif
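/*
 * Illustration (not part of the original file): with AO_t == int, these
 * wrappers let plain unsigned int counters use the generic machinery.
 * A reference-count sketch (names invented):
 *
 *   static volatile unsigned int refs = 1;
 *
 *   AO_int_fetch_and_add1_acquire(&refs);            take a reference
 *   if (AO_int_fetch_and_sub1_release(&refs) == 1)   returns prior value
 *       destroy_object();                            last reference gone
 *
 * The casts to (volatile AO_t *) in the macros above are what make this
 * legal only when AO_t really has the size of int.
 */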

c_src/atomic_ops/atomic_ops.h Normal file

@@ -0,0 +1,348 @@
/*
* Copyright (c) 2003 Hewlett-Packard Development Company, L.P.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef ATOMIC_OPS_H
#define ATOMIC_OPS_H
#include <assert.h>
#include <stddef.h>
/* We define various atomic operations on memory in a */
/* machine-specific way. Unfortunately, this is complicated */
/* by the fact that these may or may not be combined with */
/* various memory barriers. Thus the actual operations we */
/* define have the form AO_<atomic-op>_<barrier>, for all */
/* plausible combinations of <atomic-op> and <barrier>. */
/* This of course results in a mild combinatorial explosion. */
/* To deal with it, we try to generate derived */
/* definitions for as many of the combinations as we can, as */
/* automatically as possible. */
/* */
/* Our assumption throughout is that the programmer will */
/* specify the least demanding operation and memory barrier */
/* that will guarantee correctness for the implementation. */
/* Our job is to find the least expensive way to implement it */
/* on the applicable hardware. In many cases that will */
/* involve, for example, a stronger memory barrier, or a */
/* combination of hardware primitives. */
/* */
/* Conventions: */
/* "plain" atomic operations are not guaranteed to include */
/* a barrier. The suffix in the name specifies the barrier */
/* type. Suffixes are: */
/* _release: Earlier operations may not be delayed past it. */
/* _acquire: Later operations may not move ahead of it. */
/* _read: Subsequent reads must follow this operation and */
/* preceding reads. */
/* _write: Earlier writes precede both this operation and */
/* later writes. */
/* _full: Ordered with respect to both earlier and later memops.*/
/* _release_write: Ordered with respect to earlier writes. */
/* _acquire_read: Ordered with respect to later reads. */
/* */
/* Currently we try to define the following atomic memory */
/* operations, in combination with the above barriers: */
/* AO_nop */
/* AO_load */
/* AO_store */
/* AO_test_and_set (binary) */
/* AO_fetch_and_add */
/* AO_fetch_and_add1 */
/* AO_fetch_and_sub1 */
/* AO_or */
/* AO_compare_and_swap */
/* */
/* Note that atomicity guarantees are valid only if both */
/* readers and writers use AO_ operations to access the */
/* shared value, while ordering constraints are intended to */
/* apply to all memory operations. If a location can potentially */
/* be accessed simultaneously from multiple threads, and one of */
/* those accesses may be a write access, then all such */
/* accesses to that location should be through AO_ primitives. */
/* However if AO_ operations enforce sufficient ordering to */
/* ensure that a location x cannot be accessed concurrently, */
/* or can only be read concurrently, then x can be accessed */
/* via ordinary references and assignments. */
/* */
/* Compare_and_exchange takes an address and an expected old */
/* value and a new value, and returns an int. Nonzero */
/* indicates that it succeeded. */
/* Test_and_set takes an address, atomically replaces it by */
/* AO_TS_SET, and returns the prior value. */
/* An AO_TS_t location can be reset with the */
/* AO_CLEAR macro, which normally uses AO_store_release. */
/* AO_fetch_and_add takes an address and an AO_t increment */
/* value. The AO_fetch_and_add1 and AO_fetch_and_sub1 variants */
/* are provided, since they allow faster implementations on */
/* some hardware. AO_or atomically ors an AO_t value into a */
/* memory location, but does not provide access to the original.*/
/* */
/* We expect this list to grow slowly over time. */
/* */
/* Note that AO_nop_full is a full memory barrier. */
/* */
/* Note that if some data is initialized with */
/* data.x = ...; data.y = ...; ... */
/* AO_store_release_write(&data_is_initialized, 1) */
/* then data is guaranteed to be initialized after the test */
/* if (AO_load_acquire_read(&data_is_initialized)) ... */
/* succeeds. Furthermore, this should generate near-optimal */
/* code on all common platforms. */
/* */
/* All operations operate on unsigned AO_t, which */
/* is the natural word size, and usually unsigned long. */
/* It is possible to check whether a particular operation op */
/* is available on a particular platform by checking whether */
/* AO_HAVE_op is defined. We make heavy use of these macros */
/* internally. */
/* The rest of this file basically has three sections: */
/* */
/* Some utility and default definitions. */
/* */
/* The architecture dependent section: */
/* This defines atomic operations that have direct hardware */
/* support on a particular platform, mostly by including the */
/* appropriate compiler- and hardware-dependent file. */
/* */
/* The synthesis section: */
/* This tries to define other atomic operations in terms of */
/* those that are explicitly available on the platform. */
/* This section is hardware independent. */
/* We make no attempt to synthesize operations in ways that */
/* effectively introduce locks, except for the debugging/demo */
/* pthread-based implementation at the beginning. A more */
/* realistic implementation that falls back to locks could be */
/* added as a higher layer. But that would sacrifice */
/* usability from signal handlers. */
/* The synthesis section is implemented almost entirely in */
/* atomic_ops_generalize.h. */
/* Some common defaults. Overridden for some architectures. */
#define AO_t size_t
/* The test_and_set primitive returns an AO_TS_VAL_t value. */
/* AO_TS_t is the type of an in-memory test-and-set location. */
#define AO_TS_INITIALIZER (AO_t)AO_TS_CLEAR
/* Platform-dependent stuff: */
#if defined(__GNUC__) || defined(_MSC_VER) || defined(__INTEL_COMPILER) \
|| defined(__DMC__) || defined(__WATCOMC__)
# define AO_INLINE static __inline
#elif defined(__sun)
# define AO_INLINE static inline
#else
# define AO_INLINE static
#endif
#if defined(__GNUC__) && !defined(__INTEL_COMPILER)
# define AO_compiler_barrier() __asm__ __volatile__("" : : : "memory")
#elif defined(_MSC_VER) || defined(__DMC__) || defined(__BORLANDC__) \
|| defined(__WATCOMC__)
# if defined(_AMD64_) || defined(_M_X64) || _MSC_VER >= 1400
# if defined(_WIN32_WCE)
/* # include <cmnintrin.h> */
# elif defined(_MSC_VER)
# include <intrin.h>
# endif
# pragma intrinsic(_ReadWriteBarrier)
# define AO_compiler_barrier() _ReadWriteBarrier()
/* We assume this does not generate a fence instruction. */
/* The documentation is a bit unclear. */
# else
# define AO_compiler_barrier() __asm { }
/* The preceding implementation may be preferable here too. */
/* But the documentation warns about VC++ 2003 and earlier. */
# endif
#elif defined(__INTEL_COMPILER)
# define AO_compiler_barrier() __memory_barrier() /* Too strong? IA64-only? */
#elif defined(_HPUX_SOURCE)
# if defined(__ia64)
# include <machine/sys/inline.h>
# define AO_compiler_barrier() _Asm_sched_fence()
# else
/* FIXME - We don't know how to do this. This is a guess. */
/* And probably a bad one. */
static volatile int AO_barrier_dummy;
# define AO_compiler_barrier() AO_barrier_dummy = AO_barrier_dummy
# endif
#else
/* We conjecture that the following usually gives us the right */
/* semantics or an error. */
# define AO_compiler_barrier() asm("")
#endif
#if defined(AO_USE_PTHREAD_DEFS)
# include "atomic_ops/sysdeps/generic_pthread.h"
#endif /* AO_USE_PTHREAD_DEFS */
#if defined(__GNUC__) && !defined(AO_USE_PTHREAD_DEFS) \
&& !defined(__INTEL_COMPILER)
# if defined(__i386__)
/* We don't define AO_USE_SYNC_CAS_BUILTIN for x86 here because */
/* it might require specifying additional options (like -march) */
/* or additional link libraries (if -march is not specified). */
# include "./x86.h"
# endif /* __i386__ */
# if defined(__x86_64__)
# if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)
/* It is safe to use __sync CAS built-in on this architecture. */
# define AO_USE_SYNC_CAS_BUILTIN
# endif
# include "./x86_64.h"
# endif /* __x86_64__ */
# if defined(__ia64__)
# include "./ia64.h"
# define AO_GENERALIZE_TWICE
# endif /* __ia64__ */
# if defined(__hppa__)
# include "atomic_ops/sysdeps/gcc/hppa.h"
# define AO_CAN_EMUL_CAS
# endif /* __hppa__ */
# if defined(__alpha__)
# include "atomic_ops/sysdeps/gcc/alpha.h"
# define AO_GENERALIZE_TWICE
# endif /* __alpha__ */
# if defined(__s390__)
# include "atomic_ops/sysdeps/gcc/s390.h"
# endif /* __s390__ */
# if defined(__sparc__)
# include "./sparc.h"
# define AO_CAN_EMUL_CAS
# endif /* __sparc__ */
# if defined(__m68k__)
# include "atomic_ops/sysdeps/gcc/m68k.h"
# endif /* __m68k__ */
# if defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \
|| defined(__powerpc64__) || defined(__ppc64__)
# include "./powerpc.h"
# endif /* __powerpc__ */
# if defined(__arm__) && !defined(AO_USE_PTHREAD_DEFS)
# include "atomic_ops/sysdeps/gcc/arm.h"
# define AO_CAN_EMUL_CAS
# endif /* __arm__ */
# if defined(__cris__) || defined(CRIS)
# include "atomic_ops/sysdeps/gcc/cris.h"
# endif
# if defined(__mips__)
# include "atomic_ops/sysdeps/gcc/mips.h"
# endif /* __mips__ */
# if defined(__sh__) || defined(SH4)
# include "atomic_ops/sysdeps/gcc/sh.h"
# define AO_CAN_EMUL_CAS
# endif /* __sh__ */
#endif /* __GNUC__ && !AO_USE_PTHREAD_DEFS */
#if defined(__INTEL_COMPILER) && !defined(AO_USE_PTHREAD_DEFS)
# if defined(__ia64__)
# include "./ia64.h"
# define AO_GENERALIZE_TWICE
# endif
# if defined(__GNUC__)
/* Intel Compiler in GCC compatible mode */
# if defined(__i386__)
# include "./x86.h"
# endif /* __i386__ */
# if defined(__x86_64__)
# if __INTEL_COMPILER > 1110
# define AO_USE_SYNC_CAS_BUILTIN
# endif
# include "./x86_64.h"
# endif /* __x86_64__ */
# endif
#endif
#if defined(_HPUX_SOURCE) && !defined(__GNUC__) && !defined(AO_USE_PTHREAD_DEFS)
# if defined(__ia64)
# include "atomic_ops/sysdeps/hpc/ia64.h"
# define AO_GENERALIZE_TWICE
# else
# include "atomic_ops/sysdeps/hpc/hppa.h"
# define AO_CAN_EMUL_CAS
# endif
#endif
#if defined(__sun) && !defined(__GNUC__) && !defined(AO_USE_PTHREAD_DEFS)
/* Note: use -DAO_USE_PTHREAD_DEFS if Sun CC does not handle inline asm. */
# if defined(__i386)
# include "atomic_ops/sysdeps/sunc/x86.h"
# endif /* __i386 */
# if defined(__x86_64) || defined(__amd64)
# include "atomic_ops/sysdeps/sunc/x86_64.h"
# endif /* __x86_64 */
#endif
#if !defined(__GNUC__) && (defined(sparc) || defined(__sparc)) \
&& !defined(AO_USE_PTHREAD_DEFS)
# include "atomic_ops/sysdeps/sunc/sparc.h"
# define AO_CAN_EMUL_CAS
#endif
#if defined(_MSC_VER) || defined(__DMC__) || defined(__BORLANDC__) \
|| (defined(__WATCOMC__) && defined(__NT__))
# if defined(_AMD64_) || defined(_M_X64)
# include "atomic_ops/sysdeps/msftc/x86_64.h"
# elif defined(_M_IX86) || defined(x86)
# include "atomic_ops/sysdeps/msftc/x86.h"
# elif defined(_M_ARM) || defined(ARM) || defined(_ARM_)
# include "atomic_ops/sysdeps/msftc/arm.h"
# endif
#endif
#if defined(AO_REQUIRE_CAS) && !defined(AO_HAVE_compare_and_swap) \
&& !defined(AO_HAVE_compare_and_swap_full) \
&& !defined(AO_HAVE_compare_and_swap_acquire)
# if defined(AO_CAN_EMUL_CAS)
# include "atomic_ops/sysdeps/emul_cas.h"
# else
# error Cannot implement AO_compare_and_swap_full on this architecture.
# endif
#endif /* AO_REQUIRE_CAS && !AO_HAVE_compare_and_swap ... */
/* The most common way to clear a test-and-set location */
/* at the end of a critical section. */
#if AO_AO_TS_T && !defined(AO_CLEAR)
# define AO_CLEAR(addr) AO_store_release((AO_TS_t *)(addr), AO_TS_CLEAR)
#endif
#if AO_CHAR_TS_T && !defined(AO_CLEAR)
# define AO_CLEAR(addr) AO_char_store_release((AO_TS_t *)(addr), AO_TS_CLEAR)
#endif
/*
* The generalization section.
* Theoretically this should repeatedly include atomic_ops_generalize.h.
* In fact, we observe that this converges after a small fixed number
* of iterations, usually one.
*/
#include "./generalize.h"
#ifdef AO_GENERALIZE_TWICE
# include "./generalize.h"
#endif
/* For compatibility with version 0.4 and earlier */
#define AO_TS_T AO_TS_t
#define AO_T AO_t
#define AO_TS_VAL AO_TS_VAL_t
#endif /* ATOMIC_OPS_H */
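/*
 * Illustration (not part of the original file): where the test-and-set
 * primitive is available (check AO_HAVE_test_and_set_full), the AO_TS_t
 * machinery described above yields a minimal spinlock:
 *
 *   static AO_TS_t lock = AO_TS_INITIALIZER;
 *
 *   void enter(void) {
 *       while (AO_test_and_set_full(&lock) == AO_TS_SET)
 *           ;                        spin: someone else holds the lock
 *   }
 *   void leave(void) {
 *       AO_CLEAR(&lock);             release store, as documented above
 *   }
 *
 * AO_test_and_set_full returns the prior value, so AO_TS_SET means the
 * lock was already held. enter/leave are invented names for the sketch.
 */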

File diff suppressed because it is too large

File diff suppressed because it is too large

c_src/atomic_ops/ia64.h Normal file

@@ -0,0 +1,297 @@
/*
* Copyright (c) 2003 Hewlett-Packard Development Company, L.P.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "./aligned_atomic_load_store.h"
#include "./all_acquire_release_volatile.h"
#include "./test_and_set_t_is_char.h"
#ifdef _ILP32
/* 32-bit HP/UX code. */
/* This requires pointer "swizzling". Pointers need to be expanded */
/* to 64 bits using the addp4 instruction before use. This makes it */
/* hard to share code, but we try anyway. */
# define AO_LEN "4"
/* We assume that addr always appears in argument position 1 in asm */
/* code. If it is clobbered due to swizzling, we also need it in */
/* second position. Any later arguments are referenced symbolically, */
/* so that we don't have to worry about their position. This requires */
/* gcc 3.1, but you shouldn't be using anything older than that on */
/* IA64 anyway. */
/* The AO_MASK macro is a workaround for the fact that HP/UX gcc */
/* appears to otherwise store 64-bit pointers in ar.ccv, i.e. it */
/* doesn't appear to clear high bits in a pointer value we pass into */
/* assembly code, even if it is supposedly of type AO_t. */
# define AO_IN_ADDR "1"(addr)
# define AO_OUT_ADDR , "=r"(addr)
# define AO_SWIZZLE "addp4 %1=0,%1;;\n"
# define AO_MASK(ptr) __asm__("zxt4 %1=%1": "=r"(ptr) : "0"(ptr));
#else
# define AO_LEN "8"
# define AO_IN_ADDR "r"(addr)
# define AO_OUT_ADDR
# define AO_SWIZZLE
# define AO_MASK(ptr)
#endif
AO_INLINE void
AO_nop_full(void)
{
__asm__ __volatile__("mf" : : : "memory");
}
#define AO_HAVE_nop_full
AO_INLINE AO_t
AO_fetch_and_add1_acquire (volatile AO_t *addr)
{
AO_t result;
__asm__ __volatile__ (AO_SWIZZLE
"fetchadd" AO_LEN ".acq %0=[%1],1":
"=r" (result) AO_OUT_ADDR: AO_IN_ADDR :"memory");
return result;
}
#define AO_HAVE_fetch_and_add1_acquire
AO_INLINE AO_t
AO_fetch_and_add1_release (volatile AO_t *addr)
{
AO_t result;
__asm__ __volatile__ (AO_SWIZZLE
"fetchadd" AO_LEN ".rel %0=[%1],1":
"=r" (result) AO_OUT_ADDR: AO_IN_ADDR :"memory");
return result;
}
#define AO_HAVE_fetch_and_add1_release
AO_INLINE AO_t
AO_fetch_and_sub1_acquire (volatile AO_t *addr)
{
AO_t result;
__asm__ __volatile__ (AO_SWIZZLE
"fetchadd" AO_LEN ".acq %0=[%1],-1":
"=r" (result) AO_OUT_ADDR: AO_IN_ADDR :"memory");
return result;
}
#define AO_HAVE_fetch_and_sub1_acquire
AO_INLINE AO_t
AO_fetch_and_sub1_release (volatile AO_t *addr)
{
AO_t result;
__asm__ __volatile__ (AO_SWIZZLE
"fetchadd" AO_LEN ".rel %0=[%1],-1":
"=r" (result) AO_OUT_ADDR: AO_IN_ADDR :"memory");
return result;
}
#define AO_HAVE_fetch_and_sub1_release
#ifndef _ILP32
AO_INLINE unsigned int
AO_int_fetch_and_add1_acquire (volatile unsigned int *addr)
{
unsigned int result;
__asm__ __volatile__ ("fetchadd4.acq %0=[%1],1":
"=r" (result): AO_IN_ADDR :"memory");
return result;
}
#define AO_HAVE_int_fetch_and_add1_acquire
AO_INLINE unsigned int
AO_int_fetch_and_add1_release (volatile unsigned int *addr)
{
unsigned int result;
__asm__ __volatile__ ("fetchadd4.rel %0=[%1],1":
"=r" (result): AO_IN_ADDR :"memory");
return result;
}
#define AO_HAVE_int_fetch_and_add1_release
AO_INLINE unsigned int
AO_int_fetch_and_sub1_acquire (volatile unsigned int *addr)
{
unsigned int result;
__asm__ __volatile__ ("fetchadd4.acq %0=[%1],-1":
"=r" (result): AO_IN_ADDR :"memory");
return result;
}
#define AO_HAVE_int_fetch_and_sub1_acquire
AO_INLINE unsigned int
AO_int_fetch_and_sub1_release (volatile unsigned int *addr)
{
unsigned int result;
__asm__ __volatile__ ("fetchadd4.rel %0=[%1],-1":
"=r" (result): AO_IN_ADDR :"memory");
return result;
}
#define AO_HAVE_int_fetch_and_sub1_release
#endif /* !_ILP32 */
AO_INLINE int
AO_compare_and_swap_acquire(volatile AO_t *addr,
AO_t old, AO_t new_val)
{
AO_t oldval;
AO_MASK(old);
__asm__ __volatile__(AO_SWIZZLE
"mov ar.ccv=%[old] ;; cmpxchg" AO_LEN
".acq %0=[%1],%[new_val],ar.ccv"
: "=r"(oldval) AO_OUT_ADDR
: AO_IN_ADDR, [new_val]"r"(new_val), [old]"r"(old)
: "memory");
return (oldval == old);
}
#define AO_HAVE_compare_and_swap_acquire
AO_INLINE int
AO_compare_and_swap_release(volatile AO_t *addr,
AO_t old, AO_t new_val)
{
AO_t oldval;
AO_MASK(old);
__asm__ __volatile__(AO_SWIZZLE
"mov ar.ccv=%[old] ;; cmpxchg" AO_LEN
".rel %0=[%1],%[new_val],ar.ccv"
: "=r"(oldval) AO_OUT_ADDR
: AO_IN_ADDR, [new_val]"r"(new_val), [old]"r"(old)
: "memory");
return (oldval == old);
}
#define AO_HAVE_compare_and_swap_release
AO_INLINE int
AO_char_compare_and_swap_acquire(volatile unsigned char *addr,
unsigned char old, unsigned char new_val)
{
unsigned char oldval;
__asm__ __volatile__(AO_SWIZZLE
"mov ar.ccv=%[old] ;; cmpxchg1.acq %0=[%1],%[new_val],ar.ccv"
: "=r"(oldval) AO_OUT_ADDR
: AO_IN_ADDR, [new_val]"r"(new_val), [old]"r"((AO_t)old)
: "memory");
return (oldval == old);
}
#define AO_HAVE_char_compare_and_swap_acquire
AO_INLINE int
AO_char_compare_and_swap_release(volatile unsigned char *addr,
unsigned char old, unsigned char new_val)
{
unsigned char oldval;
__asm__ __volatile__(AO_SWIZZLE
"mov ar.ccv=%[old] ;; cmpxchg1.rel %0=[%1],%[new_val],ar.ccv"
: "=r"(oldval) AO_OUT_ADDR
: AO_IN_ADDR, [new_val]"r"(new_val), [old]"r"((AO_t)old)
: "memory");
return (oldval == old);
}
#define AO_HAVE_char_compare_and_swap_release
AO_INLINE int
AO_short_compare_and_swap_acquire(volatile unsigned short *addr,
unsigned short old, unsigned short new_val)
{
unsigned short oldval;
__asm__ __volatile__(AO_SWIZZLE
"mov ar.ccv=%[old] ;; cmpxchg2.acq %0=[%1],%[new_val],ar.ccv"
: "=r"(oldval) AO_OUT_ADDR
: AO_IN_ADDR, [new_val]"r"(new_val), [old]"r"((AO_t)old)
: "memory");
return (oldval == old);
}
#define AO_HAVE_short_compare_and_swap_acquire
AO_INLINE int
AO_short_compare_and_swap_release(volatile unsigned short *addr,
unsigned short old, unsigned short new_val)
{
unsigned short oldval;
__asm__ __volatile__(AO_SWIZZLE
"mov ar.ccv=%[old] ;; cmpxchg2.rel %0=[%1],%[new_val],ar.ccv"
: "=r"(oldval) AO_OUT_ADDR
: AO_IN_ADDR, [new_val]"r"(new_val), [old]"r"((AO_t)old)
: "memory");
return (oldval == old);
}
#define AO_HAVE_short_compare_and_swap_release
#ifndef _ILP32
AO_INLINE int
AO_int_compare_and_swap_acquire(volatile unsigned int *addr,
unsigned int old, unsigned int new_val)
{
unsigned int oldval;
__asm__ __volatile__("mov ar.ccv=%3 ;; cmpxchg4.acq %0=[%1],%2,ar.ccv"
: "=r"(oldval)
: AO_IN_ADDR, "r"(new_val), "r"((AO_t)old) : "memory");
return (oldval == old);
}
#define AO_HAVE_int_compare_and_swap_acquire
AO_INLINE int
AO_int_compare_and_swap_release(volatile unsigned int *addr,
unsigned int old, unsigned int new_val)
{
unsigned int oldval;
__asm__ __volatile__("mov ar.ccv=%3 ;; cmpxchg4.rel %0=[%1],%2,ar.ccv"
: "=r"(oldval)
: AO_IN_ADDR, "r"(new_val), "r"((AO_t)old) : "memory");
return (oldval == old);
}
#define AO_HAVE_int_compare_and_swap_release
#endif /* !_ILP32 */
/* FIXME: Add compare_and_swap_double as soon as there is widely */
/* available hardware that implements it. */
/* FIXME: Add compare_double_and_swap_double for the _ILP32 case. */
#ifdef _ILP32
# include "./ao_t_is_int.h"
#endif
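/*
 * Illustration (not part of the original file): the generalization layer
 * (generalize.h, included from atomic_ops.h) derives operations this file
 * does not define directly. For example, an arbitrary-increment
 * fetch-and-add can be synthesized from the compare-and-swap above with a
 * retry loop:
 *
 *   AO_t old;
 *   do {
 *       old = AO_load(addr);
 *   } while (!AO_compare_and_swap_acquire(addr, old, old + incr));
 *   ... old now holds the value from before the addition
 */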

@@ -0,0 +1,100 @@
/*
* Copyright (c) 2003 by Hewlett-Packard Company. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*
* These are common definitions for architectures that provide processor
* ordered memory operations except that a later read may pass an
* earlier write. Real x86 implementations seem to be in this category,
* except apparently for some IDT WinChips, which we ignore.
*/
#include "read_ordered.h"
AO_INLINE void
AO_nop_write(void)
{
AO_compiler_barrier();
/* sfence according to Intel docs. Pentium 3 and up. */
/* Unnecessary for cached accesses? */
}
#define AO_HAVE_nop_write
#if defined(AO_HAVE_store)
AO_INLINE void
AO_store_write(volatile AO_t *addr, AO_t val)
{
AO_compiler_barrier();
AO_store(addr, val);
}
# define AO_HAVE_store_write
# define AO_store_release(addr, val) AO_store_write(addr, val)
# define AO_HAVE_store_release
#endif /* AO_HAVE_store */
#if defined(AO_HAVE_char_store)
AO_INLINE void
AO_char_store_write(volatile unsigned char *addr, unsigned char val)
{
AO_compiler_barrier();
AO_char_store(addr, val);
}
# define AO_HAVE_char_store_write
# define AO_char_store_release(addr, val) AO_char_store_write(addr, val)
# define AO_HAVE_char_store_release
#endif /* AO_HAVE_char_store */
#if defined(AO_HAVE_short_store)
AO_INLINE void
AO_short_store_write(volatile unsigned short *addr, unsigned short val)
{
AO_compiler_barrier();
AO_short_store(addr, val);
}
# define AO_HAVE_short_store_write
# define AO_short_store_release(addr, val) AO_short_store_write(addr, val)
# define AO_HAVE_short_store_release
#endif /* AO_HAVE_short_store */
#if defined(AO_HAVE_int_store)
AO_INLINE void
AO_int_store_write(volatile unsigned int *addr, unsigned int val)
{
AO_compiler_barrier();
AO_int_store(addr, val);
}
# define AO_HAVE_int_store_write
# define AO_int_store_release(addr, val) AO_int_store_write(addr, val)
# define AO_HAVE_int_store_release
#endif /* AO_HAVE_int_store */

346
c_src/atomic_ops/powerpc.h Normal file

@ -0,0 +1,346 @@
/*
* Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved.
* Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved.
* Copyright (c) 1999-2004 Hewlett-Packard Development Company, L.P.
*
*
* THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
* OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
*
* Permission is hereby granted to use or copy this program
* for any purpose, provided the above notices are retained on all copies.
* Permission to modify the code and to distribute modified code is granted,
* provided the above notices are retained, and a notice that the code was
* modified is included with the above copyright notice.
*
*/
/* Memory model documented at http://www-106.ibm.com/developerworks/ */
/* eserver/articles/archguide.html and (clearer) */
/* http://www-106.ibm.com/developerworks/eserver/articles/powerpc.html. */
/* There appears to be no implicit ordering between any kind of */
/* independent memory references. */
/* Architecture enforces some ordering based on control dependence. */
/* I don't know if that could help. */
/* Data-dependent loads are always ordered. */
/* Based on the above references, eieio is intended for use on */
/* uncached memory, which we don't support. It does not order loads */
/* from cached memory. */
/* Thanks to Maged Michael, Doug Lea, and Roger Hoover for helping to */
/* track some of this down and correcting my misunderstandings. -HB */
/* Earl Chew subsequently contributed further fixes & additions. */
#include "./aligned_atomic_load_store.h"
#include "./test_and_set_t_is_ao_t.h"
/* There seems to be no byte equivalent of lwarx, so this */
/* may really be what we want, at least in the 32-bit case. */
AO_INLINE void
AO_nop_full(void)
{
__asm__ __volatile__("sync" : : : "memory");
}
#define AO_HAVE_nop_full
/* lwsync apparently works for everything but a StoreLoad barrier. */
AO_INLINE void
AO_lwsync(void)
{
#ifdef __NO_LWSYNC__
__asm__ __volatile__("sync" : : : "memory");
#else
__asm__ __volatile__("lwsync" : : : "memory");
#endif
}
#define AO_nop_write() AO_lwsync()
#define AO_HAVE_nop_write
#define AO_nop_read() AO_lwsync()
#define AO_HAVE_nop_read
/* We explicitly specify load_acquire, since it is important, and can */
/* be implemented relatively cheaply. It could be implemented */
/* with an ordinary load followed by a lwsync. But the general wisdom */
/* seems to be that a data dependent branch followed by an isync is */
/* cheaper. And the documentation is fairly explicit that this also */
/* has acquire semantics. */
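/* For comparison, the load-then-lwsync alternative mentioned above */
/* would be roughly the following (a sketch, not what this file uses; */
/* lwsync orders the load before all later loads and stores, which is */
/* exactly acquire semantics): */
/*
 *   AO_INLINE AO_t
 *   AO_load_acquire_alt(const volatile AO_t *addr)
 *   {
 *     AO_t result = *addr;
 *     AO_lwsync();
 *     return result;
 *   }
 */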
/* ppc64 uses ld not lwz */
#if defined(__powerpc64__) || defined(__ppc64__) || defined(__64BIT__)
AO_INLINE AO_t
AO_load_acquire(const volatile AO_t *addr)
{
AO_t result;
__asm__ __volatile__ (
"ld%U1%X1 %0,%1\n"
"cmpw %0,%0\n"
"bne- 1f\n"
"1: isync\n"
: "=r" (result)
: "m"(*addr) : "memory", "cr0");
return result;
}
#else
AO_INLINE AO_t
AO_load_acquire(const volatile AO_t *addr)
{
AO_t result;
/* FIXME: We should get gcc to allocate one of the condition */
/* registers. I always got "impossible constraint" when I */
/* tried the "y" constraint. */
__asm__ __volatile__ (
"lwz%U1%X1 %0,%1\n"
"cmpw %0,%0\n"
"bne- 1f\n"
"1: isync\n"
: "=r" (result)
: "m"(*addr) : "memory", "cc");
return result;
}
#endif
#define AO_HAVE_load_acquire
/* We explicitly specify store_release, since it relies */
/* on the fact that lwsync is also a LoadStore barrier. */
AO_INLINE void
AO_store_release(volatile AO_t *addr, AO_t value)
{
AO_lwsync();
*addr = value;
}
#define AO_HAVE_store_release
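/* Illustrative pairing (a sketch; shared_data and ready_flag are */
/* hypothetical): a producer publishes with store_release, a consumer */
/* observes with load_acquire, so the data can never appear stale: */
/*
 *   shared_data = compute();             // plain store
 *   AO_store_release(&ready_flag, 1);    // lwsync, then store
 *   ...
 *   if (AO_load_acquire(&ready_flag))    // load, branch, isync
 *     consume(shared_data);              // guaranteed to see the data
 */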
/* This is similar to the code in the garbage collector. Deleting */
/* this and having it synthesized from compare_and_swap would probably */
/* only cost us a load immediate instruction. */
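/* The synthesized version alluded to above would be roughly (a sketch, */
/* relying on test_and_set_t_is_ao_t.h, where AO_TS_CLEAR == 0 and */
/* AO_TS_SET == 1): */
/*
 *   AO_INLINE AO_TS_VAL_t
 *   AO_test_and_set_synth(volatile AO_TS_t *addr)
 *   {
 *     return AO_compare_and_swap(addr, AO_TS_CLEAR, AO_TS_SET)
 *            ? AO_TS_CLEAR : AO_TS_SET;   // old value, as usual
 *   }
 */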
#if defined(__powerpc64__) || defined(__ppc64__) || defined(__64BIT__)
/* Completely untested. And we should be using smaller objects anyway. */
AO_INLINE AO_TS_VAL_t
AO_test_and_set(volatile AO_TS_t *addr) {
unsigned long oldval;
unsigned long temp = 1; /* locked value */
__asm__ __volatile__(
"1:ldarx %0,0,%1\n" /* load and reserve */
"cmpdi %0, 0\n" /* if load is */
"bne 2f\n" /* non-zero, return already set */
"stdcx. %2,0,%1\n" /* else store conditional */
"bne- 1b\n" /* retry if lost reservation */
"2:\n" /* oldval is zero if we set */
: "=&r"(oldval)
: "r"(addr), "r"(temp)
: "memory", "cr0");
return (AO_TS_VAL_t)oldval;
}
#else
AO_INLINE AO_TS_VAL_t
AO_test_and_set(volatile AO_TS_t *addr) {
int oldval;
int temp = 1; /* locked value */
__asm__ __volatile__(
"1:lwarx %0,0,%1\n" /* load and reserve */
"cmpwi %0, 0\n" /* if load is */
"bne 2f\n" /* non-zero, return already set */
"stwcx. %2,0,%1\n" /* else store conditional */
"bne- 1b\n" /* retry if lost reservation */
"2:\n" /* oldval is zero if we set */
: "=&r"(oldval)
: "r"(addr), "r"(temp)
: "memory", "cr0");
return (AO_TS_VAL_t)oldval;
}
#endif
#define AO_HAVE_test_and_set
AO_INLINE AO_TS_VAL_t
AO_test_and_set_acquire(volatile AO_TS_t *addr) {
AO_TS_VAL_t result = AO_test_and_set(addr);
AO_lwsync();
return result;
}
#define AO_HAVE_test_and_set_acquire
AO_INLINE AO_TS_VAL_t
AO_test_and_set_release(volatile AO_TS_t *addr) {
AO_lwsync();
return AO_test_and_set(addr);
}
#define AO_HAVE_test_and_set_release
AO_INLINE AO_TS_VAL_t
AO_test_and_set_full(volatile AO_TS_t *addr) {
AO_TS_VAL_t result;
AO_lwsync();
result = AO_test_and_set(addr);
AO_lwsync();
return result;
}
#define AO_HAVE_test_and_set_full
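/* Typical use (an illustrative sketch): a minimal spinlock. */
/*
 *   static volatile AO_TS_t lock = AO_TS_INITIALIZER;
 *   while (AO_test_and_set_acquire(&lock) == AO_TS_SET)
 *     ;                          // spin until we flip clear -> set
 *   ... critical section ...
 *   AO_CLEAR(&lock);             // release store of AO_TS_CLEAR
 */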
#if defined(__powerpc64__) || defined(__ppc64__) || defined(__64BIT__)
/* FIXME: Completely untested. */
AO_INLINE int
AO_compare_and_swap(volatile AO_t *addr, AO_t old, AO_t new_val) {
AO_t oldval;
int result = 0;
__asm__ __volatile__(
"1:ldarx %0,0,%2\n" /* load and reserve */
"cmpd %0, %4\n" /* if load is not equal to */
"bne 2f\n" /* old, fail */
"stdcx. %3,0,%2\n" /* else store conditional */
"bne- 1b\n" /* retry if lost reservation */
"li %1,1\n" /* result = 1; */
"2:\n"
: "=&r"(oldval), "=&r"(result)
: "r"(addr), "r"(new_val), "r"(old), "1"(result)
: "memory", "cr0");
return result;
}
#else
AO_INLINE int
AO_compare_and_swap(volatile AO_t *addr, AO_t old, AO_t new_val) {
AO_t oldval;
int result = 0;
__asm__ __volatile__(
"1:lwarx %0,0,%2\n" /* load and reserve */
"cmpw %0, %4\n" /* if load is not equal to */
"bne 2f\n" /* old, fail */
"stwcx. %3,0,%2\n" /* else store conditional */
"bne- 1b\n" /* retry if lost reservation */
"li %1,1\n" /* result = 1; */
"2:\n"
: "=&r"(oldval), "=&r"(result)
: "r"(addr), "r"(new_val), "r"(old), "1"(result)
: "memory", "cr0");
return result;
}
#endif
#define AO_HAVE_compare_and_swap
AO_INLINE int
AO_compare_and_swap_acquire(volatile AO_t *addr, AO_t old, AO_t new_val) {
int result = AO_compare_and_swap(addr, old, new_val);
AO_lwsync();
return result;
}
#define AO_HAVE_compare_and_swap_acquire
AO_INLINE int
AO_compare_and_swap_release(volatile AO_t *addr, AO_t old, AO_t new_val) {
AO_lwsync();
return AO_compare_and_swap(addr, old, new_val);
}
#define AO_HAVE_compare_and_swap_release
AO_INLINE int
AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) {
AO_t result;
AO_lwsync();
result = AO_compare_and_swap(addr, old, new_val);
AO_lwsync();
return result;
}
#define AO_HAVE_compare_and_swap_full
#if defined(__powerpc64__) || defined(__ppc64__) || defined(__64BIT__)
/* FIXME: Completely untested. */
AO_INLINE AO_t
AO_fetch_and_add(volatile AO_t *addr, AO_t incr) {
AO_t oldval;
AO_t newval;
__asm__ __volatile__(
"1:ldarx %0,0,%2\n" /* load and reserve */
"add %1,%0,%3\n" /* increment */
"stdcx. %1,0,%2\n" /* store conditional */
"bne- 1b\n" /* retry if lost reservation */
: "=&r"(oldval), "=&r"(newval)
: "r"(addr), "r"(incr)
: "memory", "cr0");
return oldval;
}
#define AO_HAVE_fetch_and_add
#else
AO_INLINE AO_t
AO_fetch_and_add(volatile AO_t *addr, AO_t incr) {
AO_t oldval;
AO_t newval;
__asm__ __volatile__(
"1:lwarx %0,0,%2\n" /* load and reserve */
"add %1,%0,%3\n" /* increment */
"stwcx. %1,0,%2\n" /* store conditional */
"bne- 1b\n" /* retry if lost reservation */
: "=&r"(oldval), "=&r"(newval)
: "r"(addr), "r"(incr)
: "memory", "cr0");
return oldval;
}
#define AO_HAVE_fetch_and_add
#endif
AO_INLINE AO_t
AO_fetch_and_add_acquire(volatile AO_t *addr, AO_t incr) {
AO_t result = AO_fetch_and_add(addr, incr);
AO_lwsync();
return result;
}
#define AO_HAVE_fetch_and_add_acquire
AO_INLINE AO_t
AO_fetch_and_add_release(volatile AO_t *addr, AO_t incr) {
AO_lwsync();
return AO_fetch_and_add(addr, incr);
}
#define AO_HAVE_fetch_and_add_release
AO_INLINE AO_t
AO_fetch_and_add_full(volatile AO_t *addr, AO_t incr) {
AO_t result;
AO_lwsync();
result = AO_fetch_and_add(addr, incr);
AO_lwsync();
return result;
}
#define AO_HAVE_fetch_and_add_full
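/* Example (sketch): fetch_and_add returns the *old* value, so a shared */
/* counter doubles as a unique-id generator: */
/*
 *   static volatile AO_t next_id;
 *   AO_t my_id = AO_fetch_and_add_full(&next_id, 1);
 */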
#if defined(__powerpc64__) || defined(__ppc64__) || defined(__64BIT__)
#else
# include "./ao_t_is_int.h"
#endif

c_src/atomic_ops/read_ordered.h Normal file

@ -0,0 +1,100 @@
/*
* Copyright (c) 2003 by Hewlett-Packard Company. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*
* These are common definitions for architectures that provide processor
* ordered memory operations except that a later read may pass an
* earlier write. Real x86 implementations seem to be in this category,
* except apparently for some IDT WinChips, which we ignore.
*/
AO_INLINE void
AO_nop_read(void)
{
AO_compiler_barrier();
}
#define AO_HAVE_NOP_READ
#ifdef AO_HAVE_load
AO_INLINE AO_t
AO_load_read(const volatile AO_t *addr)
{
AO_t result = AO_load(addr);
AO_compiler_barrier();
return result;
}
#define AO_HAVE_load_read
#define AO_load_acquire(addr) AO_load_read(addr)
#define AO_HAVE_load_acquire
#endif /* AO_HAVE_load */
#ifdef AO_HAVE_char_load
AO_INLINE AO_t
AO_char_load_read(const volatile unsigned char *addr)
{
AO_t result = AO_char_load(addr);
AO_compiler_barrier();
return result;
}
#define AO_HAVE_char_load_read
#define AO_char_load_acquire(addr) AO_char_load_read(addr)
#define AO_HAVE_char_load_acquire
#endif /* AO_HAVE_char_load */
#ifdef AO_HAVE_short_load
AO_INLINE AO_t
AO_short_load_read(const volatile unsigned short *addr)
{
AO_t result = AO_short_load(addr);
AO_compiler_barrier();
return result;
}
#define AO_HAVE_short_load_read
#define AO_short_load_acquire(addr) AO_short_load_read(addr)
#define AO_HAVE_short_load_acquire
#endif /* AO_HAVE_short_load */
#ifdef AO_HAVE_int_load
AO_INLINE AO_t
AO_int_load_read(const volatile unsigned int *addr)
{
AO_t result = AO_int_load(addr);
AO_compiler_barrier();
return result;
}
#define AO_HAVE_int_load_read
#define AO_int_load_acquire(addr) AO_int_load_read(addr)
#define AO_HAVE_int_load_acquire
#endif /* AO_HAVE_int_load */

72
c_src/atomic_ops/sparc.h Normal file

@ -0,0 +1,72 @@
/*
* Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved.
* Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved.
* Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved.
*
*
* THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
* OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
*
* Permission is hereby granted to use or copy this program
* for any purpose, provided the above notices are retained on all copies.
* Permission to modify the code and to distribute modified code is granted,
* provided the above notices are retained, and a notice that the code was
* modified is included with the above copyright notice.
*
*/
/* FIXME. Very incomplete. No support for sparc64. */
/* Non-ancient SPARCs provide compare-and-swap (casa). */
/* We should make that available. */
#include "./aligned_atomic_load_store.h"
/* Real SPARC code uses TSO: */
#include "./ordered_except_wr.h"
/* Test_and_set location is just a byte. */
#include "./test_and_set_t_is_char.h"
AO_INLINE AO_TS_VAL_t
AO_test_and_set_full(volatile AO_TS_t *addr) {
AO_TS_VAL_t oldval;
__asm__ __volatile__("ldstub %1,%0"
: "=r"(oldval), "=m"(*addr)
: "m"(*addr) : "memory");
return oldval;
}
#define AO_HAVE_test_and_set_full
#ifndef AO_NO_SPARC_V9
/* Returns nonzero if the comparison succeeded. */
AO_INLINE int
AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val) {
char ret;
__asm__ __volatile__ ("membar #StoreLoad | #LoadLoad\n\t"
# if defined(__arch64__)
"casx [%2],%0,%1\n\t"
# else
"cas [%2],%0,%1\n\t" /* 32-bit version */
# endif
"membar #StoreLoad | #StoreStore\n\t"
"cmp %0,%1\n\t"
"be,a 0f\n\t"
"mov 1,%0\n\t"/* one insn after branch always executed */
"clr %0\n\t"
"0:\n\t"
: "=r" (ret), "+r" (new_val)
: "r" (addr), "0" (old)
: "memory", "cc");
return (int)ret;
}
#define AO_HAVE_compare_and_swap_full
#endif /* AO_NO_SPARC_V9 */
/* FIXME: This needs to be extended for SPARC v8 and v9. */
/* SPARC V8 also has swap. V9 has CAS. */
/* There are barriers like membar #LoadStore. */
/* CASA (32-bit) and CASXA(64-bit) instructions were */
/* added in V9. */

c_src/atomic_ops/standard_ao_double_t.h Normal file

@ -0,0 +1,25 @@
/* NEC LE-IT: For 64-bit OSes we extend the double type to hold two int64s.
 *
 * x86-64: __m128 serves as a placeholder, which also requires the compiler
 * to align it on a 16-byte boundary (as required by cmpxchg16b).
 * Similar things could be done for 64-bit PowerPC using a VMX data type... */
#if (defined(__x86_64__) && defined(__GNUC__)) || defined(_WIN64)
# include <xmmintrin.h>
typedef __m128 double_ptr_storage;
#elif defined(_WIN32) && !defined(__GNUC__)
typedef unsigned __int64 double_ptr_storage;
#else
typedef unsigned long long double_ptr_storage;
#endif
# define AO_HAVE_DOUBLE_PTR_STORAGE
typedef union {
double_ptr_storage AO_whole;
struct {AO_t AO_v1; AO_t AO_v2;} AO_parts;
} AO_double_t;
#define AO_HAVE_double_t
#define AO_val1 AO_parts.AO_v1
#define AO_val2 AO_parts.AO_v2
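/* Example (a sketch; loc, new_ptr are hypothetical): AO_double_t is
 * what the double-width CAS operations in the x86 ports below operate
 * on, e.g. a {pointer, version} pair used to dodge ABA:
 *
 *   AO_double_t cur = *loc;                       // both halves
 *   AO_compare_double_and_swap_double_full(loc,
 *       cur.AO_val1, cur.AO_val2,                 // expected ptr, ver
 *       (AO_t)new_ptr, cur.AO_val2 + 1);          // new ptr, ver + 1
 */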

c_src/atomic_ops/test_and_set_t_is_ao_t.h Normal file

@ -0,0 +1,36 @@
/*
* Copyright (c) 2004 Hewlett-Packard Development Company, L.P.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*
* These are common definitions for architectures on which test_and_set
* operates on pointer-sized quantities, the "clear" value contains
* all zeroes, and the "set" value contains only one lowest bit set.
* This can be used if test_and_set is synthesized from compare_and_swap.
*/
typedef enum {AO_TS_clear = 0, AO_TS_set = 1} AO_TS_val;
#define AO_TS_VAL_t AO_TS_val
#define AO_TS_CLEAR AO_TS_clear
#define AO_TS_SET AO_TS_set
#define AO_TS_t AO_t
#define AO_AO_TS_T 1

c_src/atomic_ops/test_and_set_t_is_char.h Normal file

@ -0,0 +1,38 @@
/*
* Copyright (c) 2004 Hewlett-Packard Development Company, L.P.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
/*
* These are common definitions for architectures on which test_and_set
* operates on byte sized quantities, the "clear" value contains
* all zeroes, and the "set" value contains all ones.
*/
#define AO_TS_t unsigned char
typedef enum {AO_BYTE_TS_clear = 0, AO_BYTE_TS_set = 0xff} AO_BYTE_TS_val;
#define AO_TS_VAL_t AO_BYTE_TS_val
#define AO_TS_CLEAR AO_BYTE_TS_clear
#define AO_TS_SET AO_BYTE_TS_set
#define AO_CHAR_TS_T 1

173
c_src/atomic_ops/x86.h Normal file

@ -0,0 +1,173 @@
/*
* Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved.
* Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved.
* Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved.
*
*
* THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
* OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
*
* Permission is hereby granted to use or copy this program
* for any purpose, provided the above notices are retained on all copies.
* Permission to modify the code and to distribute modified code is granted,
* provided the above notices are retained, and a notice that the code was
* modified is included with the above copyright notice.
*
* Some of the machine specific code was borrowed from our GC distribution.
*/
/* The following really assume we have a 486 or better. Unfortunately */
/* gcc doesn't define a suitable feature test macro based on command */
/* line options. */
/* We should perhaps test dynamically. */
#include "./aligned_atomic_load_store.h"
/* Real X86 implementations, except for some old WinChips, appear */
/* to enforce ordering between memory operations, EXCEPT that a later */
/* read can pass earlier writes, presumably due to the visible */
/* presence of store buffers. */
/* We ignore both the WinChips, and the fact that the official specs */
/* seem to be much weaker (and arguably too weak to be usable). */
#include "./ordered_except_wr.h"
#include "./test_and_set_t_is_char.h"
#include "./standard_ao_double_t.h"
#if defined(AO_USE_PENTIUM4_INSTRS)
AO_INLINE void
AO_nop_full(void)
{
__asm__ __volatile__("mfence" : : : "memory");
}
#define AO_HAVE_nop_full
#else
/* We could use the cpuid instruction. But that seems to be slower */
/* than the default implementation based on test_and_set_full. Thus */
/* we omit that bit of misinformation here. */
#endif
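/* For reference, the cpuid-based full barrier rejected above would be */
/* something like (a sketch; cpuid serializes the pipeline but clobbers */
/* eax-edx, and benchmarks slower than the test_and_set_full default): */
/*
 *   __asm__ __volatile__("xorl %%eax, %%eax; cpuid"
 *                        : : : "eax", "ebx", "ecx", "edx", "memory");
 */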
/* As far as we can tell, the lfence and sfence instructions are not */
/* currently needed or useful for cached memory accesses. */
/* Really only works for 486 and later */
AO_INLINE AO_t
AO_fetch_and_add_full (volatile AO_t *p, AO_t incr)
{
AO_t result;
__asm__ __volatile__ ("lock; xaddl %0, %1" :
"=r" (result), "=m" (*p) : "0" (incr), "m" (*p)
: "memory");
return result;
}
#define AO_HAVE_fetch_and_add_full
AO_INLINE unsigned char
AO_char_fetch_and_add_full (volatile unsigned char *p, unsigned char incr)
{
unsigned char result;
__asm__ __volatile__ ("lock; xaddb %0, %1" :
"=q" (result), "=m" (*p) : "0" (incr), "m" (*p)
: "memory");
return result;
}
#define AO_HAVE_char_fetch_and_add_full
AO_INLINE unsigned short
AO_short_fetch_and_add_full (volatile unsigned short *p, unsigned short incr)
{
unsigned short result;
__asm__ __volatile__ ("lock; xaddw %0, %1" :
"=r" (result), "=m" (*p) : "0" (incr), "m" (*p)
: "memory");
return result;
}
#define AO_HAVE_short_fetch_and_add_full
/* Really only works for 486 and later */
AO_INLINE void
AO_or_full (volatile AO_t *p, AO_t incr)
{
__asm__ __volatile__ ("lock; orl %1, %0" :
"=m" (*p) : "r" (incr), "m" (*p) : "memory");
}
#define AO_HAVE_or_full
AO_INLINE AO_TS_VAL_t
AO_test_and_set_full(volatile AO_TS_t *addr)
{
unsigned char oldval;
/* Note: the "xchg" instruction does not need a "lock" prefix */
__asm__ __volatile__("xchgb %0, %1"
: "=q"(oldval), "=m"(*addr)
: "0"((unsigned char)0xff), "m"(*addr) : "memory");
return (AO_TS_VAL_t)oldval;
}
#define AO_HAVE_test_and_set_full
/* Returns nonzero if the comparison succeeded. */
AO_INLINE int
AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val)
{
# ifdef AO_USE_SYNC_CAS_BUILTIN
return (int)__sync_bool_compare_and_swap(addr, old, new_val);
# else
char result;
__asm__ __volatile__("lock; cmpxchgl %3, %0; setz %1"
: "=m" (*addr), "=a" (result)
: "m" (*addr), "r" (new_val), "a" (old) : "memory");
return (int)result;
# endif
}
#define AO_HAVE_compare_and_swap_full
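/* Usage note (a sketch; p and flag_bit are hypothetical): lock-free */
/* read-modify-write loops are built on this primitive by retrying */
/* until no other thread intervenes: */
/*
 *   AO_t old;
 *   do {
 *     old = AO_load(p);
 *   } while (!AO_compare_and_swap_full(p, old, old | flag_bit));
 */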
/* Returns nonzero if the comparison succeeded. */
/* Really requires at least a Pentium. */
AO_INLINE int
AO_compare_double_and_swap_double_full(volatile AO_double_t *addr,
AO_t old_val1, AO_t old_val2,
AO_t new_val1, AO_t new_val2)
{
char result;
#if __PIC__
/* If PIC is turned on, we can't use %ebx as it is reserved for the
GOT pointer. We can save and restore %ebx because GCC won't be
using it for anything else (such as any of the m operands) */
__asm__ __volatile__("pushl %%ebx;" /* save ebx used for PIC GOT ptr */
"movl %6,%%ebx;" /* move new_val2 to %ebx */
"lock; cmpxchg8b %0; setz %1;"
"pop %%ebx;" /* restore %ebx */
: "=m"(*addr), "=a"(result)
: "m"(*addr), "d" (old_val2), "a" (old_val1),
"c" (new_val2), "m" (new_val1) : "memory");
#else
/* We can't just do the same thing in non-PIC mode, because GCC
* might be using %ebx as the memory operand. We could have ifdef'd
* in a clobber, but there's no point doing the push/pop if we don't
* have to. */
__asm__ __volatile__("lock; cmpxchg8b %0; setz %1;"
: "=m"(*addr), "=a"(result)
: "m"(*addr), "d" (old_val2), "a" (old_val1),
"c" (new_val2), "b" (new_val1) : "memory");
#endif
return (int) result;
}
#define AO_HAVE_compare_double_and_swap_double_full
#include "./ao_t_is_int.h"

181
c_src/atomic_ops/x86_64.h Normal file

@ -0,0 +1,181 @@
/*
* Copyright (c) 1991-1994 by Xerox Corporation. All rights reserved.
* Copyright (c) 1996-1999 by Silicon Graphics. All rights reserved.
* Copyright (c) 1999-2003 by Hewlett-Packard Company. All rights reserved.
*
*
* THIS MATERIAL IS PROVIDED AS IS, WITH ABSOLUTELY NO WARRANTY EXPRESSED
* OR IMPLIED. ANY USE IS AT YOUR OWN RISK.
*
* Permission is hereby granted to use or copy this program
* for any purpose, provided the above notices are retained on all copies.
* Permission to modify the code and to distribute modified code is granted,
* provided the above notices are retained, and a notice that the code was
* modified is included with the above copyright notice.
*
* Some of the machine specific code was borrowed from our GC distribution.
*/
#include "./aligned_atomic_load_store.h"
/* Real X86 implementations appear */
/* to enforce ordering between memory operations, EXCEPT that a later */
/* read can pass earlier writes, presumably due to the visible */
/* presence of store buffers. */
/* We ignore the fact that the official specs */
/* seem to be much weaker (and arguably too weak to be usable). */
#include "./ordered_except_wr.h"
#include "./test_and_set_t_is_char.h"
#include "./standard_ao_double_t.h"
AO_INLINE void
AO_nop_full(void)
{
/* Note: "mfence" (SSE2) is supported on all x86_64/amd64 chips. */
__asm__ __volatile__("mfence" : : : "memory");
}
#define AO_HAVE_nop_full
/* As far as we can tell, the lfence and sfence instructions are not */
/* currently needed or useful for cached memory accesses. */
AO_INLINE AO_t
AO_fetch_and_add_full (volatile AO_t *p, AO_t incr)
{
AO_t result;
__asm__ __volatile__ ("lock; xaddq %0, %1" :
"=r" (result), "=m" (*p) : "0" (incr), "m" (*p)
: "memory");
return result;
}
#define AO_HAVE_fetch_and_add_full
AO_INLINE unsigned char
AO_char_fetch_and_add_full (volatile unsigned char *p, unsigned char incr)
{
unsigned char result;
__asm__ __volatile__ ("lock; xaddb %0, %1" :
"=q" (result), "=m" (*p) : "0" (incr), "m" (*p)
: "memory");
return result;
}
#define AO_HAVE_char_fetch_and_add_full
AO_INLINE unsigned short
AO_short_fetch_and_add_full (volatile unsigned short *p, unsigned short incr)
{
unsigned short result;
__asm__ __volatile__ ("lock; xaddw %0, %1" :
"=r" (result), "=m" (*p) : "0" (incr), "m" (*p)
: "memory");
return result;
}
#define AO_HAVE_short_fetch_and_add_full
AO_INLINE unsigned int
AO_int_fetch_and_add_full (volatile unsigned int *p, unsigned int incr)
{
unsigned int result;
__asm__ __volatile__ ("lock; xaddl %0, %1" :
"=r" (result), "=m" (*p) : "0" (incr), "m" (*p)
: "memory");
return result;
}
#define AO_HAVE_int_fetch_and_add_full
AO_INLINE void
AO_or_full (volatile AO_t *p, AO_t incr)
{
__asm__ __volatile__ ("lock; orq %1, %0" :
"=m" (*p) : "r" (incr), "m" (*p) : "memory");
}
#define AO_HAVE_or_full
AO_INLINE AO_TS_VAL_t
AO_test_and_set_full(volatile AO_TS_t *addr)
{
unsigned char oldval;
/* Note: the "xchg" instruction does not need a "lock" prefix */
__asm__ __volatile__("xchgb %0, %1"
: "=q"(oldval), "=m"(*addr)
: "0"((unsigned char)0xff), "m"(*addr) : "memory");
return (AO_TS_VAL_t)oldval;
}
#define AO_HAVE_test_and_set_full
/* Returns nonzero if the comparison succeeded. */
AO_INLINE int
AO_compare_and_swap_full(volatile AO_t *addr, AO_t old, AO_t new_val)
{
# ifdef AO_USE_SYNC_CAS_BUILTIN
return (int)__sync_bool_compare_and_swap(addr, old, new_val);
# else
char result;
__asm__ __volatile__("lock; cmpxchgq %3, %0; setz %1"
: "=m" (*addr), "=a" (result)
: "m" (*addr), "r" (new_val), "a" (old) : "memory");
return (int) result;
# endif
}
#define AO_HAVE_compare_and_swap_full
#ifdef AO_CMPXCHG16B_AVAILABLE
/* NEC LE-IT: older AMD Opterons are missing this instruction.
* On these machines SIGILL will be thrown.
* Define AO_WEAK_DOUBLE_CAS_EMULATION to have an emulated
* (lock based) version available */
/* HB: Changed this to not define either by default. There are
 * enough machines and tool chains around on which cmpxchg16b
 * doesn't work. And the emulation is unsafe by our usual rules.
 * However, both are clearly useful in certain cases.
 */
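/* One possible run-time probe before enabling this path (a sketch,
 * GCC-specific; CPUID leaf 1, ECX bit 13 advertises cmpxchg16b):
 *
 *   #include <cpuid.h>
 *   unsigned a, b, c, d;
 *   int have_cx16 = __get_cpuid(1, &a, &b, &c, &d) && ((c >> 13) & 1);
 */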
AO_INLINE int
AO_compare_double_and_swap_double_full(volatile AO_double_t *addr,
AO_t old_val1, AO_t old_val2,
AO_t new_val1, AO_t new_val2)
{
char result;
__asm__ __volatile__("lock; cmpxchg16b %0; setz %1"
: "=m"(*addr), "=a"(result)
: "m"(*addr), "d" (old_val2), "a" (old_val1),
"c" (new_val2), "b" (new_val1) : "memory");
return (int) result;
}
#define AO_HAVE_compare_double_and_swap_double_full
#else
/* this one provides spinlock based emulation of CAS implemented in */
/* atomic_ops.c. We probably do not want to do this here, since it is */
/* not atomic with respect to other kinds of updates of *addr. On the */
/* other hand, this may be a useful facility on occasion. */
#ifdef AO_WEAK_DOUBLE_CAS_EMULATION
int AO_compare_double_and_swap_double_emulation(volatile AO_double_t *addr,
AO_t old_val1, AO_t old_val2,
AO_t new_val1, AO_t new_val2);
AO_INLINE int
AO_compare_double_and_swap_double_full(volatile AO_double_t *addr,
AO_t old_val1, AO_t old_val2,
AO_t new_val1, AO_t new_val2)
{
return AO_compare_double_and_swap_double_emulation(addr,
old_val1, old_val2,
new_val1, new_val2);
}
#define AO_HAVE_compare_double_and_swap_double_full
#endif /* AO_WEAK_DOUBLE_CAS_EMULATION */
#endif /* AO_CMPXCHG16B_AVAILABLE */

c_src/cas.h

@ -1,159 +0,0 @@
/*
* wterl: an Erlang NIF for WiredTiger
*
* Copyright (c) 2012-2013 Basho Technologies, Inc. All Rights Reserved.
*
* This file is provided to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the License.
* You may obtain a copy of the License at:
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
*/
/*
* Most of the following source code is copied directly from: "The Lock-Free
* Library" (http://www.cl.cam.ac.uk/research/srg/netos/lock-free/) reused and
* redistributed in accordance with their license:
*
* Copyright (c) 2002-2003 K A Fraser, All Rights Reserved.
*
* * Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are
* met:
*
* * Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* * The name of the author may not be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED
* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
* EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef __CAS_H_
#define __CAS_H_
#define CACHE_LINE_SIZE 64
#define ATOMIC_INCR(_v,_newval) \
do { \
__typeof(_v) __val = (_v); \
while ( (_newval = CASIO(&(_v),__val,__val+1)) != __val ) \
__val = _newval; \
} while ( 0 )
#define ATOMIC_ADD_TO(_v,_x) \
do { \
__typeof(_v) __val = (_v), __newval; \
while ( (__newval = CASIO(&(_v),__val,__val+(_x))) != __val ) \
__val = __newval; \
} while ( 0 )
#define ATOMIC_SET_TO(_v,_x) \
do { \
int __val = (_v), __newval; \
while ( (__newval = CASIO(&(_v),__val,__val=(_x))) != __val ) \
__val = __newval; \
} while ( 0 )
#define CACHE_ALIGNED_SIZEOF(_s) \
 (((sizeof(_s) + CACHE_LINE_SIZE*2) + \
 CACHE_LINE_SIZE - 1) & ~(CACHE_LINE_SIZE-1))
/*
* I. Compare-and-swap.
*/
/*
* This is a strong barrier! Reads cannot be delayed beyond a later store.
* Reads cannot be hoisted beyond a LOCK prefix. Stores always in-order.
*/
#define CAS(_a, _o, _n) \
({ __typeof__(_o) __o = _o; \
__asm__ __volatile__( \
"lock cmpxchg %3,%1" \
: "=a" (__o), "=m" (*(volatile unsigned int *)(_a)) \
: "0" (__o), "r" (_n) ); \
__o; \
})
#define FAS(_a, _n) \
({ __typeof__(_n) __o; \
__asm__ __volatile__( \
"lock xchg %0,%1" \
: "=r" (__o), "=m" (*(volatile unsigned int *)(_a)) \
: "0" (_n) ); \
__o; \
})
#define CAS64(_a, _o, _n) \
({ __typeof__(_o) __o = _o; \
__asm__ __volatile__( \
"movl %3, %%ecx;" \
"movl %4, %%ebx;" \
"lock cmpxchg8b %1" \
: "=A" (__o), "=m" (*(volatile unsigned long long *)(_a)) \
: "0" (__o), "m" (_n >> 32), "m" (_n) \
: "ebx", "ecx" ); \
__o; \
})
/* Update Integer location, return Old value. */
#define CASIO CAS
#define FASIO FAS
/* Update Pointer location, return Old value. */
#define CASPO CAS
#define FASPO FAS
/* Update 32/64-bit location, return Old value. */
#define CAS32O CAS
#define CAS64O CAS64
/*
* II. Memory barriers.
* WMB(): All preceding write operations must commit before any later writes.
* RMB(): All preceding read operations must commit before any later reads.
* MB(): All preceding memory accesses must commit before any later accesses.
*
* If the compiler does not observe these barriers (but any sane compiler
* will!), then VOLATILE should be defined as 'volatile'.
*/
#define MB() __asm__ __volatile__ ("lock; addl $0,0(%%esp)" : : : "memory")
#define WMB() __asm__ __volatile__ ("" : : : "memory")
#define RMB() MB()
#define VOLATILE /*volatile*/
/* On Intel, CAS is a strong barrier, but not a compile barrier. */
#define RMB_NEAR_CAS() WMB()
#define WMB_NEAR_CAS() WMB()
#define MB_NEAR_CAS() WMB()
/*
* III. Cycle counter access.
*/
typedef unsigned long long tick_t;
#define RDTICK() \
({ tick_t __t; __asm__ __volatile__ ("rdtsc" : "=A" (__t)); __t; })
#endif /* __CAS_H_ */

c_src/wterl.c

@ -31,7 +31,7 @@
#include "common.h"
#include "async_nif.h"
#include "queue.h"
#include "cas.h"
#include "atomic.h"
#define MAX_CACHE_SIZE ASYNC_NIF_MAX_WORKERS
@ -325,25 +325,22 @@ __retain_ctx(WterlConnHandle *conn_handle, uint32_t worker_id,
va_end(ap);
*ctx = NULL;
c = conn_handle->mru_ctx[worker_id];
if (CASPO(&conn_handle->mru_ctx[worker_id], c, 0) == c) {
if (c == 0) {
// mru miss:
DPRINTF("[%.4u] mru miss, empty", worker_id);
*ctx = NULL;
} else {
if (c->sig == sig) {
// mru hit:
DPRINTF("[%.4u] mru hit: %llu found", worker_id, PRIuint64(sig));
*ctx = c;
} else {
// mru mismatch:
DPRINTF("[%.4u] mru miss: %llu != %llu", worker_id, PRIuint64(sig), PRIuint64(c->sig));
__ctx_cache_add(conn_handle, c);
*ctx = NULL;
}
}
if (ATOMIC_CAS_FULL(&conn_handle->mru_ctx[worker_id], c, 0) && c != NULL) {
if (c->sig == sig) {
// mru hit:
DPRINTF("[%.4u] mru hit: %llu found", worker_id, PRIuint64(sig));
*ctx = c;
} else {
// mru mismatch:
DPRINTF("[%.4u] mru miss: %llu != %llu", worker_id, PRIuint64(sig), PRIuint64(c->sig));
__ctx_cache_add(conn_handle, c);
*ctx = NULL;
}
} else {
// mru miss:
DPRINTF("[%.4u] mru miss, empty", worker_id);
*ctx = NULL;
}
if (*ctx == NULL) {
@ -411,16 +408,12 @@ __release_ctx(WterlConnHandle *conn_handle, uint32_t worker_id, struct wterl_ctx
}
c = conn_handle->mru_ctx[worker_id];
if (CASPO(&conn_handle->mru_ctx[worker_id], c, ctx) != c) {
if (ATOMIC_CAS_FULL(&conn_handle->mru_ctx[worker_id], c, ctx) && c != NULL) {
__ctx_cache_add(conn_handle, c);
DPRINTF("[%.4u] reset %d cursors, returned ctx to cache", worker_id, ctx->num_cursors);
} else {
__ctx_cache_add(conn_handle, ctx);
DPRINTF("[%.4u] reset %d cursors, returnd ctx to cache", worker_id, ctx->num_cursors);
} else {
if (c != NULL) {
__ctx_cache_add(conn_handle, c);
DPRINTF("[%.4u] reset %d cursors, returned ctx to cache", worker_id, ctx->num_cursors);
} else {
DPRINTF("[%.4u] reset %d cursors, returned ctx to mru", worker_id, ctx->num_cursors);
}
}
}
@ -437,14 +430,13 @@ __close_all_sessions(WterlConnHandle *conn_handle)
// clear out the mru
for (worker_id = 0; worker_id < ASYNC_NIF_MAX_WORKERS; worker_id++) {
do {
c = conn_handle->mru_ctx[worker_id];
} while(CASPO(&conn_handle->mru_ctx[worker_id], c, 0) != c);
if (c != 0) {
c->session->close(c->session, NULL);
enif_free(c);
}
c = conn_handle->mru_ctx[worker_id];
if (ATOMIC_CAS_FULL(&conn_handle->mru_ctx[worker_id], c, 0)) {
if (c) {
c->session->close(c->session, NULL);
enif_free(c);
}
}
}
// clear out the cache
@ -472,8 +464,8 @@ __close_cursors_on(WterlConnHandle *conn_handle, const char *uri)
// walk the mru first, look for open cursors on matching uri
for (worker_id = 0; worker_id < ASYNC_NIF_MAX_WORKERS; worker_id++) {
c = conn_handle->mru_ctx[worker_id];
if (CASPO(&conn_handle->mru_ctx[worker_id], c, 0) == c && c != 0) {
c = conn_handle->mru_ctx[worker_id];
if (ATOMIC_CAS_FULL(&conn_handle->mru_ctx[worker_id], c, 0) && c != 0) {
cnt = c->num_cursors;
for(idx = 0; idx < cnt; idx++) {
if (!strcmp(c->ci[idx].uri, uri)) {
@ -481,9 +473,11 @@ __close_cursors_on(WterlConnHandle *conn_handle, const char *uri)
enif_free(c);
break;
} else {
if (CASPO(&conn_handle->mru_ctx[worker_id], 0, c) != 0) {
// not a match, put it back on the mru
struct wterl_ctx *l = conn_handle->mru_ctx[worker_id];
if (!ATOMIC_CAS_FULL(&conn_handle->mru_ctx[worker_id], l, c))
__ctx_cache_add(conn_handle, c);
}
if (l) __ctx_cache_add(conn_handle, l);
}
}
}