From 6bc97fb756cbd1fa962bbeed3d50fb55063dff91 Mon Sep 17 00:00:00 2001 From: Michael Hamburg Date: Thu, 14 Jan 2016 18:11:00 -0800 Subject: [PATCH] need an include/arch_*/arch_intrinsics.h for other arches --- Makefile | 14 +- src/include/arch_x86_64/arch_intrinsics.h | 306 ++++++++++++++++++++ src/include/word.h | 3 +- src/p25519/arch_ref64/f_impl.c | 45 +-- src/p25519/arch_x86_64/f_impl.c | 82 ++---- src/p25519/arch_x86_64/x86-64-arith.h | 323 ---------------------- src/p448/arch_32/f_impl.c | 80 ++---- src/p448/arch_arm_32/f_impl.c | 51 ++-- src/p448/arch_neon_experimental/f_impl.c | 43 +-- src/p448/arch_ref64/f_impl.c | 43 +-- src/p448/arch_x86_64/f_impl.c | 46 +-- src/p448/arch_x86_64/x86-64-arith.h | 323 ---------------------- src/p480/arch_x86_64/f_impl.c | 43 +-- src/p480/arch_x86_64/x86-64-arith.h | 275 ------------------ src/p521/arch_ref64/f_impl.c | 43 +-- src/p521/arch_x86_64_r12/f_impl.c | 48 +--- 16 files changed, 453 insertions(+), 1315 deletions(-) create mode 100644 src/include/arch_x86_64/arch_intrinsics.h delete mode 100644 src/p25519/arch_x86_64/x86-64-arith.h delete mode 100644 src/p448/arch_x86_64/x86-64-arith.h delete mode 100644 src/p480/arch_x86_64/x86-64-arith.h diff --git a/Makefile b/Makefile index a3c3ff4..16eb948 100644 --- a/Makefile +++ b/Makefile @@ -146,10 +146,14 @@ COMPONENTS_OF_$(1) = $$(BUILD_OBJ)/$(1)_impl.o $$(BUILD_OBJ)/$(1)_arithmetic.o LIBCOMPONENTS += $$(COMPONENTS_OF_$(1)) $$(BUILD_ASM)/$(1)_arithmetic.s: src/$(1)/f_arithmetic.c $$(HEADERS) - $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) -I $(BUILD_H)/$(1)/$(2) -S -c -o $$@ $$< + $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \ + -I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \ + -S -c -o $$@ $$< $$(BUILD_ASM)/$(1)_impl.s: src/$(1)/$(2)/f_impl.c $$(HEADERS) - $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) -I $(BUILD_H)/$(1)/$(2) -S -c -o $$@ $$< + $$(CC) $$(CFLAGS) -I src/$(1) -I src/$(1)/$(2) -I $(BUILD_H)/$(1) \ + -I $(BUILD_H)/$(1)/$(2) -I src/include/$(2) \ + -S -c -o $$@ $$< endef ################################################################ @@ -166,18 +170,18 @@ $$(BUILD_C)/decaf_tables_$(1).c: $$(BUILD_IBIN)/decaf_gen_tables_$(1) $$(BUILD_ASM)/decaf_tables_$(1).s: $$(BUILD_C)/decaf_tables_$(1).c $$(HEADERS) $$(CC) $$(CFLAGS) -S -c -o $$@ $$< \ - -I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \ + -I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) -I src/include/$$(ARCH_FOR_$(2)) \ -I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) $$(BUILD_ASM)/decaf_gen_tables_$(1).s: src/decaf_gen_tables.c $$(HEADERS) $$(CC) $$(CFLAGS) \ - -I src/curve_$(1) -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \ + -I src/curve_$(1) -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) -I src/include/$$(ARCH_FOR_$(2)) \ -I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) \ -S -c -o $$@ $$< $$(BUILD_ASM)/decaf_$(1).s: src/decaf.c $$(HEADERS) $$(CC) $$(CFLAGS) \ - -I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) \ + -I src/curve_$(1)/ -I src/$(2) -I src/$(2)/$$(ARCH_FOR_$(2)) -I src/include/$$(ARCH_FOR_$(2)) \ -I $(BUILD_H)/curve_$(1) -I $(BUILD_H)/$(2) -I $(BUILD_H)/$(2)/$$(ARCH_FOR_$(2)) \ -S -c -o $$@ $$< diff --git a/src/include/arch_x86_64/arch_intrinsics.h b/src/include/arch_x86_64/arch_intrinsics.h new file mode 100644 index 0000000..d2b03e1 --- /dev/null +++ b/src/include/arch_x86_64/arch_intrinsics.h @@ -0,0 +1,306 @@ +/* Copyright (c) 2014-2016 Cryptography Research, 
Inc.
+ * Released under the MIT License. See LICENSE.txt for license information.
+ */
+
+#ifndef __ARCH_X86_64_ARCH_INTRINSICS_H__
+#define __ARCH_X86_64_ARCH_INTRINSICS_H__
+
+#include <stdint.h>
+
+/* FUTURE: non x86-64 versions of these.
+ * FUTURE: autogenerate
+ */
+
+static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
+    uint64_t c,d;
+    #ifndef __BMI2__
+        __asm__ volatile
+            ("movq %[a], %%rax;"
+             "mulq %[b];"
+             : [c]"=a"(c), [d]"=d"(d)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "cc");
+    #else
+        __asm__ volatile
+            ("movq %[a], %%rdx;"
+             "mulx %[b], %[c], %[d];"
+             : [c]"=r"(c), [d]"=r"(d)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "rdx");
+    #endif
+    return (((__uint128_t)(d))<<64) | c;
+}
+
+static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
+    uint64_t c,d;
+    #ifndef __BMI2__
+        __asm__ volatile
+            ("movq %[a], %%rax;"
+             "mulq %[b];"
+             : [c]"=a"(c), [d]"=d"(d)
+             : [b]"m"(*b), [a]"r"(a)
+             : "cc");
+    #else
+        __asm__ volatile
+            ("mulx %[b], %[c], %[d];"
+             : [c]"=r"(c), [d]"=r"(d)
+             : [b]"m"(*b), [a]"d"(a));
+    #endif
+    return (((__uint128_t)(d))<<64) | c;
+}
+
+static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
+    uint64_t c,d;
+    #ifndef __BMI2__
+        __asm__ volatile
+            ("mulq %[b];"
+             : [c]"=a"(c), [d]"=d"(d)
+             : [b]"r"(b), "a"(a)
+             : "cc");
+    #else
+        __asm__ volatile
+            ("mulx %[b], %[c], %[d];"
+             : [c]"=r"(c), [d]"=r"(d)
+             : [b]"r"(b), [a]"d"(a));
+    #endif
+    return (((__uint128_t)(d))<<64) | c;
+}
+
+static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
+    uint64_t c,d;
+    #ifndef __BMI2__
+        __asm__ volatile
+            ("movq %[a], %%rax; "
+             "addq %%rax, %%rax; "
+             "mulq %[b];"
+             : [c]"=a"(c), [d]"=d"(d)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "cc");
+    #else
+        __asm__ volatile
+            ("movq %[a], %%rdx;"
+             "leaq (,%%rdx,2), %%rdx;"
+             "mulx %[b], %[c], %[d];"
+             : [c]"=r"(c), [d]"=r"(d)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "rdx");
+    #endif
+    return (((__uint128_t)(d))<<64) | c;
+}
+
+static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
+    uint64_t lo = *acc, hi = *acc>>64;
+
+    #ifdef __BMI2__
+        uint64_t c,d;
+        __asm__ volatile
+            ("movq %[a], %%rdx; "
+             "mulx %[b], %[c], %[d]; "
+             "addq %[c], %[lo]; "
+             "adcq %[d], %[hi]; "
+             : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "rdx", "cc");
+    #else
+        __asm__ volatile
+            ("movq %[a], %%rax; "
+             "mulq %[b]; "
+             "addq %%rax, %[lo]; "
+             "adcq %%rdx, %[hi]; "
+             : [lo]"+r"(lo), [hi]"+r"(hi)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "rax", "rdx", "cc");
+    #endif
+
+    *acc = (((__uint128_t)(hi))<<64) | lo;
+}
+
+static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
+    uint64_t lo = *acc, hi = *acc>>64;
+    uint64_t lo2 = *acc2, hi2 = *acc2>>64;
+
+    #ifdef __BMI2__
+        uint64_t c,d;
+        __asm__ volatile
+            ("movq %[a], %%rdx; "
+             "mulx %[b], %[c], %[d]; "
+             "addq %[c], %[lo]; "
+             "adcq %[d], %[hi]; "
+             "addq %[c], %[lo2]; "
+             "adcq %[d], %[hi2]; "
+             : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "rdx", "cc");
+    #else
+        __asm__ volatile
+            ("movq %[a], %%rax; "
+             "mulq %[b]; "
+             "addq %%rax, %[lo]; "
+             "adcq %%rdx, %[hi]; "
+             "addq %%rax, %[lo2]; "
+             "adcq %%rdx, %[hi2]; "
+             : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
+             : [b]"m"(*b), [a]"m"(*a)
+             : "rax", "rdx", "cc");
+    #endif
+
+    *acc = (((__uint128_t)(hi))<<64) | lo;
+    *acc2 = (((__uint128_t)(hi2))<<64) | lo2;
+}
+
+static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) {
+    
uint64_t lo = *acc, hi = *acc>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"d"(a) + : "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"r"(a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) { + uint64_t lo = *acc, hi = *acc>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"r"(b), [a]"d"(a) + : "cc"); + #else + __asm__ volatile + ("mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"r"(b), "a"(a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "addq %%rdx, %%rdx; " + "mulx %[b], %[c], %[d]; " + "addq %[c], %[lo]; " + "adcq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "addq %%rax, %%rax; " + "mulq %[b]; " + "addq %%rax, %[lo]; " + "adcq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "mulx %[b], %[c], %[d]; " + "subq %[c], %[lo]; " + "sbbq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "mulq %[b]; " + "subq %%rax, %[lo]; " + "sbbq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + *acc = (((__uint128_t)(hi))<<64) | lo; +} + +static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t lo = *acc, hi = *acc>>64; + #ifdef __BMI2__ + uint64_t c,d; + __asm__ volatile + ("movq %[a], %%rdx; " + "addq %%rdx, %%rdx; " + "mulx %[b], %[c], %[d]; " + "subq %[c], %[lo]; " + "sbbq %[d], %[hi]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + #else + __asm__ volatile + ("movq %[a], %%rax; " + "addq %%rax, %%rax; " + "mulq %[b]; " + "subq %%rax, %[lo]; " + "sbbq %%rdx, %[hi]; " + : [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rax", "rdx", "cc"); + #endif + *acc = (((__uint128_t)(hi))<<64) | lo; + +} + +static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { + uint64_t c,d, lo = *acc, hi = *acc>>64; + __asm__ volatile + ("movq %[a], %%rdx; " + "mulx %[b], %[c], %[d]; " + "subq %[lo], %[c]; " + "sbbq %[hi], %[d]; " + : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) + : [b]"m"(*b), [a]"m"(*a) + : "rdx", "cc"); + *acc = (((__uint128_t)(d))<<64) | c; +} + +static __inline__ uint64_t is_zero(uint64_t x) { + __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); 
+    return ~x;
+}
+
+static inline uint64_t shrld(__uint128_t x, int n) {
+    return x>>n;
+}
+
+#endif /* __ARCH_X86_64_ARCH_INTRINSICS_H__ */
diff --git a/src/include/word.h b/src/include/word.h
index 0ba17ee..b44a92e 100644
--- a/src/include/word.h
+++ b/src/include/word.h
@@ -9,6 +9,7 @@
 #define _XOPEN_SOURCE 600
 #include "arch_config.h"
+#include "arch_intrinsics.h"
 #include <stdint.h>
@@ -32,7 +33,6 @@
 #endif
 
 #if (WORD_BITS == 64)
-    typedef uint32_t hword_t;
     typedef uint64_t word_t, mask_t;
     typedef __uint128_t dword_t;
     typedef int32_t hsword_t;
@@ -50,7 +50,6 @@
 #define letohWORD letoh64
 #define SC_LIMB(x) (x##ull)
 #elif (WORD_BITS == 32)
-    typedef uint16_t hword_t;
     typedef uint32_t word_t, mask_t;
     typedef uint64_t dword_t;
     typedef int16_t hsword_t;
diff --git a/src/p25519/arch_ref64/f_impl.c b/src/p25519/arch_ref64/f_impl.c
index 8f24012..7afd485 100644
--- a/src/p25519/arch_ref64/f_impl.c
+++ b/src/p25519/arch_ref64/f_impl.c
@@ -16,12 +16,7 @@ static __inline__ uint64_t is_zero(uint64_t a) {
     return (((__uint128_t)a)-1)>>64;
 }
 
-void
-gf_25519_mul (
-    gf_25519_t __restrict__ cs,
-    const gf_25519_t as,
-    const gf_25519_t bs
-) {
+void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
 
     uint64_t bh[4];
@@ -51,12 +46,7 @@ gf_25519_mul (
     c[1] += accum;
 }
 
-void
-gf_25519_mulw (
-    gf_25519_t __restrict__ cs,
-    const gf_25519_t as,
-    uint64_t b
-) {
+void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) {
     const uint64_t *a = as->limb, mask = ((1ull<<51)-1);
 
     int i;
@@ -78,18 +68,11 @@ gf_25519_mulw (
     c[1] += accum;
 }
 
-void
-gf_25519_sqr (
-    gf_25519_t __restrict__ cs,
-    const gf_25519_t as
-) {
-    gf_25519_mul(cs,as,as); // PERF
+void gf_sqr (gf_s *__restrict__ cs, const gf as) {
+    gf_mul(cs,as,as); // PERF
 }
 
-void
-gf_25519_strong_reduce (
-    gf_25519_t a
-) {
+void gf_strong_reduce (gf a) {
     uint64_t mask = (1ull<<51)-1;
 
     /* first, clear high */
@@ -127,15 +110,11 @@ gf_25519_strong_reduce (
     assert(is_zero(carry + scarry));
 }
 
-void
-gf_25519_serialize (
-    uint8_t serial[32],
-    const struct gf_25519_t x
-) {
+void gf_serialize (uint8_t serial[32], const struct gf x) {
     int i,j;
-    gf_25519_t red;
-    gf_25519_copy(&red, x);
-    gf_25519_strong_reduce(&red);
+    gf red;
+    gf_copy(&red, x);
+    gf_strong_reduce(&red);
     uint64_t *r = red.limb;
     uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12};
     for (i=0; i<4; i++) {
@@ -146,11 +125,7 @@ gf_25519_serialize (
     }
 }
 
-mask_t
-gf_25519_deserialize (
-    gf_25519_t x,
-    const uint8_t serial[32]
-) {
+mask_t gf_deserialize (gf x, const uint8_t serial[32]) {
     int i,j;
     uint64_t ser64[4], mask = ((1ull<<51)-1);
     for (i=0; i<4; i++) {
diff --git a/src/p25519/arch_x86_64/f_impl.c b/src/p25519/arch_x86_64/f_impl.c
index 377252c..168dbd5 100644
--- a/src/p25519/arch_x86_64/f_impl.c
+++ b/src/p25519/arch_x86_64/f_impl.c
@@ -3,18 +3,8 @@
  */
 
 #include "f_field.h"
-#include "x86-64-arith.h"
 
-static inline uint64_t shr(__uint128_t x, int n) {
-    return x>>n;
-}
-
-void
-gf_25519_mul (
-    gf_25519_s *__restrict__ cs,
-    const gf_25519_t as,
-    const gf_25519_t bs
-) {
+void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) {
     const uint64_t *a = as->limb, *b = bs->limb, mask = ((1ull<<51)-1);
     uint64_t *c = cs->limb;
@@ -48,12 +38,12 @@
     mac_rm(&accum2, ai, &b[3]);
 
     uint64_t c0 = accum0 & mask;
-    accum1 += shr(accum0, 51);
+    accum1 += shrld(accum0, 51);
     uint64_t c1 = accum1 & mask;
-    accum2 += shr(accum1, 51);
+    accum2 += shrld(accum1, 51);
     c[2] = accum2 &
mask; - accum0 = shr(accum2, 51); + accum0 = shrld(accum2, 51); mac_rm(&accum0, ai, &b[4]); @@ -77,7 +67,7 @@ gf_25519_mul ( mac_rm(&accum1, ai, &b[0]); c[3] = accum0 & mask; - accum1 += shr(accum0, 51); + accum1 += shrld(accum0, 51); c[4] = accum1 & mask; /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 @@ -85,17 +75,13 @@ gf_25519_mul ( * PERF: good enough to fit into uint64_t? */ - uint64_t a1 = shr(accum1,51); + uint64_t a1 = shrld(accum1,51); accum1 = (__uint128_t)a1 * 19 + c0; c[0] = accum1 & mask; - c[1] = c1 + shr(accum1,51); + c[1] = c1 + shrld(accum1,51); } -void -gf_25519_sqr ( - gf_25519_s *__restrict__ cs, - const gf_25519_t as -) { +void gf_sqr (gf_s *__restrict__ cs, const gf as) { const uint64_t *a = as->limb, mask = ((1ull<<51)-1); uint64_t *c = cs->limb; @@ -122,9 +108,9 @@ gf_25519_sqr ( mac_rm(&accum2, ai, &a[4]); uint64_t c0 = accum0 & mask; - accum1 += shr(accum0, 51); + accum1 += shrld(accum0, 51); uint64_t c1 = accum1 & mask; - accum2 += shr(accum1, 51); + accum2 += shrld(accum1, 51); c[2] = accum2 & mask; accum0 = accum2 >> 51; @@ -141,7 +127,7 @@ gf_25519_sqr ( mac_rr(&accum1, a[2], a[2]); c[3] = accum0 & mask; - accum1 += shr(accum0, 51); + accum1 += shrld(accum0, 51); c[4] = accum1 & mask; /* 2^102 * 16 * 5 * 19 * (1+ep) >> 64 @@ -149,51 +135,43 @@ gf_25519_sqr ( * PERF: good enough to fit into uint64_t? */ - uint64_t a1 = shr(accum1,51); + uint64_t a1 = shrld(accum1,51); accum1 = (__uint128_t)a1 * 19 + c0; c[0] = accum1 & mask; - c[1] = c1 + shr(accum1,51); + c[1] = c1 + shrld(accum1,51); } -void -gf_25519_mulw ( - gf_25519_s *__restrict__ cs, - const gf_25519_t as, - uint64_t b -) { +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { const uint64_t *a = as->limb, mask = ((1ull<<51)-1); uint64_t *c = cs->limb; __uint128_t accum = widemul_rm(b, &a[0]); uint64_t c0 = accum & mask; - accum = shr(accum,51); + accum = shrld(accum,51); mac_rm(&accum, b, &a[1]); uint64_t c1 = accum & mask; - accum = shr(accum,51); + accum = shrld(accum,51); mac_rm(&accum, b, &a[2]); c[2] = accum & mask; - accum = shr(accum,51); + accum = shrld(accum,51); mac_rm(&accum, b, &a[3]); c[3] = accum & mask; - accum = shr(accum,51); + accum = shrld(accum,51); mac_rm(&accum, b, &a[4]); c[4] = accum & mask; - accum = shr(accum,51); + accum = shrld(accum,51); accum = accum * 19 + c0; c[0] = accum & mask; - c[1] = c1 + shr(accum,51); + c[1] = c1 + shrld(accum,51); } -void -gf_25519_strong_reduce ( - gf_25519_t a -) { +void gf_strong_reduce (gf a) { uint64_t mask = (1ull<<51)-1; /* first, clear high */ @@ -231,15 +209,11 @@ gf_25519_strong_reduce ( assert(is_zero(carry + scarry)); } -void -gf_25519_serialize ( - uint8_t serial[32], - const gf_25519_t x -) { +void gf_serialize (uint8_t serial[32], const gf x) { int i,j; - gf_25519_t red; - gf_25519_copy(red, x); - gf_25519_strong_reduce(red); + gf red; + gf_copy(red, x); + gf_strong_reduce(red); uint64_t *r = red->limb; uint64_t ser64[4] = {r[0] | r[1]<<51, r[1]>>13|r[2]<<38, r[2]>>26|r[3]<<25, r[3]>>39|r[4]<<12}; for (i=0; i<4; i++) { @@ -250,11 +224,7 @@ gf_25519_serialize ( } } -mask_t -gf_25519_deserialize ( - gf_25519_t x, - const uint8_t serial[32] -) { +mask_t gf_deserialize (gf x, const uint8_t serial[32]) { int i,j; uint64_t ser64[4], mask = ((1ull<<51)-1); for (i=0; i<4; i++) { diff --git a/src/p25519/arch_x86_64/x86-64-arith.h b/src/p25519/arch_x86_64/x86-64-arith.h deleted file mode 100644 index 00fcc1e..0000000 --- a/src/p25519/arch_x86_64/x86-64-arith.h +++ /dev/null @@ -1,323 +0,0 @@ -/* Copyright (c) 2014 Cryptography Research, 
Inc.
- * Released under the MIT License. See LICENSE.txt for license information.
- */
-
-#ifndef __X86_64_ARITH_H__
-#define __X86_64_ARITH_H__
-
-#include <stdint.h>
-
-/* TODO: non x86-64 versions of these.
- * FUTURE: autogenerate
- */
-
-static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax;"
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx;"
-             "mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx");
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax;"
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"r"(a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"d"(a));
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"r"(b), "a"(a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"r"(b), [a]"d"(a));
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax; "
-             "addq %%rax, %%rax; "
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx;"
-             "leaq (,%%rdx,2), %%rdx;"
-             "mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx");
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
-    uint64_t lo = *acc, hi = *acc>>64;
-
-    #ifdef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx; "
-             "mulx %[b], %[c], %[d]; "
-             "addq %[c], %[lo]; "
-             "adcq %[d], %[hi]; "
-             : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx", "cc");
-    #else
-        __asm__ volatile
-            ("movq %[a], %%rax; "
-             "mulq %[b]; "
-             "addq %%rax, %[lo]; "
-             "adcq %%rdx, %[hi]; "
-             : [lo]"+r"(lo), [hi]"+r"(hi)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rax", "rdx", "cc");
-    #endif
-
-    *acc = (((__uint128_t)(hi))<<64) | lo;
-}
-
-static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
-    uint64_t lo = *acc, hi = *acc>>64;
-    uint64_t lo2 = *acc2, hi2 = *acc2>>64;
-
-    #ifdef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx; "
-             "mulx %[b], %[c], %[d]; "
-             "addq %[c], %[lo]; "
-             "adcq %[d], %[hi]; "
-             "addq %[c], %[lo2]; "
-             "adcq %[d], %[hi2]; "
-             : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx", "cc");
-    #else
-        __asm__ volatile
-            ("movq %[a], %%rax; "
-             "mulq %[b]; "
-             "addq %%rax, %[lo]; "
-             "adcq %%rdx, %[hi]; "
-             "addq %%rax, %[lo2]; "
-             "adcq %%rdx, %[hi2]; "
-             : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rax", "rdx", "cc");
-    
#endif - - *acc = (((__uint128_t)(hi))<<64) | lo; - *acc2 = (((__uint128_t)(hi2))<<64) | lo2; -} - -static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"d"(a) - : "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"r"(a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"r"(b), [a]"d"(a) - : "cc"); - #else - __asm__ volatile - ("mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"r"(b), "a"(a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "addq %%rdx, %%rdx; " - "mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "addq %%rax, %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[c], %[lo]; " - "sbbq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "subq %%rax, %[lo]; " - "sbbq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "addq %%rdx, %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[c], %[lo]; " - "sbbq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "addq %%rax, %%rax; " - "mulq %[b]; " - "subq %%rax, %[lo]; " - "sbbq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - *acc = (((__uint128_t)(hi))<<64) | lo; - -} - -static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t c,d, lo = *acc, hi = *acc>>64; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[lo], %[c]; " - "sbbq %[hi], %[d]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : 
[b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - *acc = (((__uint128_t)(d))<<64) | c; -} - -static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) { - return ((__uint128_t)(a)) * b; -} - -static __inline__ __int128_t widemuls(int64_t a, int64_t b) { - return ((__int128_t)(a)) * b; -} - -static __inline__ uint64_t opacify(uint64_t x) { - __asm__ volatile("" : "+r"(x)); - return x; -} - -static __inline__ mask_t is_zero(uint64_t x) { - __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); - return ~x; -} - -#endif /* __X86_64_ARITH_H__ */ diff --git a/src/p448/arch_32/f_impl.c b/src/p448/arch_32/f_impl.c index bd900c6..739b1fb 100644 --- a/src/p448/arch_32/f_impl.c +++ b/src/p448/arch_32/f_impl.c @@ -4,28 +4,20 @@ #include "f_field.h" -static inline mask_t __attribute__((always_inline)) -is_zero ( - word_t x -) { +static inline mask_t is_zero (word_t x) { dword_t xx = x; xx--; return xx >> WORD_BITS; } -static uint64_t widemul_32 ( +static uint64_t widemul ( const uint32_t a, const uint32_t b ) { return ((uint64_t)a)* b; } -void -gf_448_mul ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - const gf_448_t bs -) { +void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { const uint32_t *a = as->limb, *b = bs->limb; uint32_t *c = cs->limb; @@ -44,9 +36,9 @@ gf_448_mul ( accum2 = 0; for (i=0; i<=j; i++) { - accum2 += widemul_32(a[j-i],b[i]); - accum1 += widemul_32(aa[j-i],bb[i]); - accum0 += widemul_32(a[8+j-i], b[8+i]); + accum2 += widemul(a[j-i],b[i]); + accum1 += widemul(aa[j-i],bb[i]); + accum0 += widemul(a[8+j-i], b[8+i]); } accum1 -= accum2; @@ -54,9 +46,9 @@ gf_448_mul ( accum2 = 0; for (; i<8; i++) { - accum0 -= widemul_32(a[8+j-i], b[i]); - accum2 += widemul_32(aa[8+j-i], bb[i]); - accum1 += widemul_32(a[16+j-i], b[8+i]); + accum0 -= widemul(a[8+j-i], b[i]); + accum2 += widemul(aa[8+j-i], bb[i]); + accum1 += widemul(a[16+j-i], b[8+i]); } accum1 += accum2; @@ -81,12 +73,7 @@ gf_448_mul ( c[1] += ((uint32_t)(accum1)); } -void -gf_448_mulw ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - uint64_t b -) { +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { const uint32_t bhi = b>>28, blo = b & ((1<<28)-1); const uint32_t *a = as->limb; @@ -97,20 +84,20 @@ gf_448_mulw ( int i; - accum0 = widemul_32(blo, a[0]); - accum8 = widemul_32(blo, a[8]); - accum0 += widemul_32(bhi, a[15]); - accum8 += widemul_32(bhi, a[15] + a[7]); + accum0 = widemul(blo, a[0]); + accum8 = widemul(blo, a[8]); + accum0 += widemul(bhi, a[15]); + accum8 += widemul(bhi, a[15] + a[7]); c[0] = accum0 & mask; accum0 >>= 28; c[8] = accum8 & mask; accum8 >>= 28; for (i=1; i<8; i++) { - accum0 += widemul_32(blo, a[i]); - accum8 += widemul_32(blo, a[i+8]); + accum0 += widemul(blo, a[i]); + accum8 += widemul(blo, a[i+8]); - accum0 += widemul_32(bhi, a[i-1]); - accum8 += widemul_32(bhi, a[i+7]); + accum0 += widemul(bhi, a[i-1]); + accum8 += widemul(bhi, a[i+7]); c[i] = accum0 & mask; accum0 >>= 28; c[i+8] = accum8 & mask; accum8 >>= 28; @@ -125,18 +112,11 @@ gf_448_mulw ( c[1] += accum8 >> 28; } -void -gf_448_sqr ( - gf_448_s *__restrict__ cs, - const gf_448_t as -) { - gf_448_mul(cs,as,as); /* PERF */ +void gf_sqr (gf_s *__restrict__ cs, const gf as) { + gf_mul(cs,as,as); /* PERF */ } -void -gf_448_strong_reduce ( - gf_448_t a -) { +void gf_strong_reduce (gf a) { word_t mask = (1ull<<28)-1; /* first, clear high */ @@ -176,15 +156,11 @@ gf_448_strong_reduce ( assert(is_zero(carry + scarry)); } -void -gf_448_serialize ( - uint8_t *serial, - const gf_448_t x -) { +void gf_serialize (uint8_t *serial, const 
gf x) { int i,j; - gf_448_t red; - gf_448_copy(red, x); - gf_448_strong_reduce(red); + gf red; + gf_copy(red, x); + gf_strong_reduce(red); for (i=0; i<8; i++) { uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28); for (j=0; j<7; j++) { @@ -195,11 +171,7 @@ gf_448_serialize ( } } -mask_t -gf_448_deserialize ( - gf_448_t x, - const uint8_t serial[56] -) { +mask_t gf_deserialize (gf x, const uint8_t serial[56]) { int i,j; for (i=0; i<8; i++) { uint64_t out = 0; diff --git a/src/p448/arch_arm_32/f_impl.c b/src/p448/arch_arm_32/f_impl.c index ea831f3..62eda0f 100644 --- a/src/p448/arch_arm_32/f_impl.c +++ b/src/p448/arch_arm_32/f_impl.c @@ -4,16 +4,13 @@ #include "f_field.h" -static inline mask_t __attribute__((always_inline)) -is_zero ( - word_t x -) { +static inline mask_t is_zero (word_t x) { dword_t xx = x; xx--; return xx >> WORD_BITS; } -static uint64_t widemul_32 ( +static uint64_t widemul ( const uint32_t a, const uint32_t b ) { @@ -97,12 +94,7 @@ smull2 ( #endif } -void -gf_448_mul ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - const gf_448_t bs -) { +void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { const uint32_t *a = as->limb, *b = bs->limb; uint32_t *c = cs->limb; @@ -448,11 +440,7 @@ gf_448_mul ( c[1] += ((uint32_t)(accum1)); } -void -gf_448_sqr ( - gf_448_s *__restrict__ cs, - const gf_448_t as -) { +void gf_sqr (gf_s *__restrict__ cs, const gf as) { const uint32_t *a = as->limb; uint32_t *c = cs->limb; @@ -746,10 +734,9 @@ gf_448_sqr ( c[1] += ((uint32_t)(accum1)); } -void -gf_448_mulw ( - gf_448_s *__restrict__ cs, - const gf_448_t as, +void gf_mulw ( + gf_s *__restrict__ cs, + const gf as, uint64_t b ) { uint32_t mask = (1ull<<28)-1; @@ -763,8 +750,8 @@ gf_448_mulw ( int i; uint32_t c0, c8, n0, n8; - accum0 = widemul_32(bhi, a[15]); - accum8 = widemul_32(bhi, a[15] + a[7]); + accum0 = widemul(bhi, a[15]); + accum8 = widemul(bhi, a[15] + a[7]); c0 = a[0]; c8 = a[8]; smlal(&accum0, blo, c0); smlal(&accum8, blo, c8); @@ -860,9 +847,8 @@ gf_448_mulw ( c[1] += accum8 >> 28; } -void -gf_448_strong_reduce ( - gf_448_t a +void gf_strong_reduce ( + gf a ) { word_t mask = (1ull<<28)-1; @@ -903,15 +889,14 @@ gf_448_strong_reduce ( assert(is_zero(carry + scarry)); } -void -gf_448_serialize ( +void gf_serialize ( uint8_t *serial, - const gf_448_t x + const gf x ) { int i,j; - gf_448_t red; - gf_448_copy(red, x); - gf_448_strong_reduce(red); + gf red; + gf_copy(red, x); + gf_strong_reduce(red); for (i=0; i<8; i++) { uint64_t limb = red->limb[2*i] + (((uint64_t)red->limb[2*i+1])<<28); for (j=0; j<7; j++) { @@ -923,8 +908,8 @@ gf_448_serialize ( } mask_t -gf_448_deserialize ( - gf_448_t x, +gf_deserialize ( + gf x, const uint8_t serial[56] ) { int i,j; diff --git a/src/p448/arch_neon_experimental/f_impl.c b/src/p448/arch_neon_experimental/f_impl.c index 002ef40..1225f5e 100644 --- a/src/p448/arch_neon_experimental/f_impl.c +++ b/src/p448/arch_neon_experimental/f_impl.c @@ -67,12 +67,7 @@ smull2 ( *acc = (int64_t)(int32_t)a * (int64_t)(int32_t)b * 2; } -void -gf_448_mul ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - const gf_448_t bs -) { +void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { #define _bl0 "q0" #define _bl0_0 "d0" #define _bl0_1 "d1" @@ -366,11 +361,7 @@ gf_448_mul ( ); } -void -gf_448_sqr ( - gf_448_s *__restrict__ cs, - const gf_448_t bs -) { +void gf_sqr (gf_s *__restrict__ cs, const gf bs) { int32x2_t *vc = (int32x2_t*) cs->limb; __asm__ __volatile__ ( @@ -567,12 +558,7 @@ gf_448_sqr ( ); } -void -gf_448_mulw ( - 
gf_448_s *__restrict__ cs, - const gf_448_t as, - uint64_t b -) { +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { uint32x2_t vmask = {(1<<28) - 1, (1<<28)-1}; uint64x2_t accum; @@ -618,10 +604,7 @@ gf_448_mulw ( } /* PERF: vectorize? */ -void -gf_448_strong_reduce ( - gf_448_t a -) { +void gf_strong_reduce (gf a) { word_t mask = (1ull<<28)-1; /* first, clear high */ @@ -661,15 +644,11 @@ gf_448_strong_reduce ( assert(is_zero(carry + scarry)); } -void -gf_448_serialize ( - uint8_t *serial, - const gf_448_t x -) { +void gf_serialize (uint8_t *serial, const gf x) { int i,j; - gf_448_t red; - gf_448_copy(red, x); - gf_448_strong_reduce(red); + gf red; + gf_copy(red, x); + gf_strong_reduce(red); for (i=0; i<8; i++) { uint64_t limb = red->limb[LIMBPERM(2*i)] + (((uint64_t)red->limb[LIMBPERM(2*i+1)])<<28); @@ -681,11 +660,7 @@ gf_448_serialize ( } } -mask_t -gf_448_deserialize ( - gf_448_t x, - const uint8_t serial[56] -) { +mask_t gf_deserialize (gf x, const uint8_t serial[56]) { int i,j; for (i=0; i<8; i++) { uint64_t out = 0; diff --git a/src/p448/arch_ref64/f_impl.c b/src/p448/arch_ref64/f_impl.c index 88bef61..74aeeb1 100644 --- a/src/p448/arch_ref64/f_impl.c +++ b/src/p448/arch_ref64/f_impl.c @@ -16,12 +16,7 @@ static __inline__ uint64_t is_zero(uint64_t a) { return (((__uint128_t)a)-1)>>64; } -void -gf_448_mul ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - const gf_448_t bs -) { +void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { const uint64_t *a = as->limb, *b = bs->limb; uint64_t *c = cs->limb; @@ -182,12 +177,7 @@ gf_448_mul ( c[1] += ((uint64_t)(accum1)); } -void -gf_448_mulw ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - uint64_t b -) { +void gf_mulw (gf_s *__restrict__ cs, const gf as, uint64_t b) { const uint64_t *a = as->limb; uint64_t *c = cs->limb; @@ -211,11 +201,7 @@ gf_448_mulw ( c[1] += accum4 >> 56; } -void -gf_448_sqr ( - gf_448_s *__restrict__ cs, - const gf_448_t as -) { +void gf_sqr (gf_s *__restrict__ cs, const gf as) { const uint64_t *a = as->limb; uint64_t *c = cs->limb; @@ -326,10 +312,7 @@ gf_448_sqr ( c[0] += ((uint64_t)(accum1)); } -void -gf_448_strong_reduce ( - gf_448_t a -) { +void gf_strong_reduce (gf a) { uint64_t mask = (1ull<<56)-1; /* first, clear high */ @@ -369,15 +352,11 @@ gf_448_strong_reduce ( assert(is_zero(carry + scarry)); } -void -gf_448_serialize ( - uint8_t *serial, - const gf_448_t x -) { +void gf_serialize (uint8_t *serial, const gf x) { int i,j; - gf_448_t red; - gf_448_copy(red, x); - gf_448_strong_reduce(red); + gf red; + gf_copy(red, x); + gf_strong_reduce(red); for (i=0; i<8; i++) { for (j=0; j<7; j++) { serial[7*i+j] = red->limb[i]; @@ -387,11 +366,7 @@ gf_448_serialize ( } } -mask_t -gf_448_deserialize ( - gf_448_t x, - const uint8_t serial[56] -) { +mask_t gf_deserialize (gf x, const uint8_t serial[56]) { int i,j; for (i=0; i<8; i++) { uint64_t out = 0; diff --git a/src/p448/arch_x86_64/f_impl.c b/src/p448/arch_x86_64/f_impl.c index 9c02d84..07744fa 100644 --- a/src/p448/arch_x86_64/f_impl.c +++ b/src/p448/arch_x86_64/f_impl.c @@ -3,14 +3,8 @@ */ #include "f_field.h" -#include "x86-64-arith.h" - -void -gf_448_mul ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - const gf_448_t bs -) { + +void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { const uint64_t *a = as->limb, *b = bs->limb; uint64_t *c = cs->limb; @@ -145,12 +139,7 @@ gf_448_mul ( c[0] += ((uint64_t)(accum1)); } -void -gf_448_mulw ( - gf_448_s *__restrict__ cs, - const gf_448_t as, - uint64_t b -) { +void gf_mulw 
(gf_s *__restrict__ cs, const gf as, uint64_t b) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
@@ -190,11 +179,7 @@ gf_448_mulw (
     c[1] += accum4 >> 56;
 }
 
-void
-gf_448_sqr (
-    gf_448_s *__restrict__ cs,
-    const gf_448_t as
-) {
+void gf_sqr (gf_s *__restrict__ cs, const gf as) {
     const uint64_t *a = as->limb;
     uint64_t *c = cs->limb;
@@ -305,10 +290,7 @@ gf_448_sqr (
     c[0] += ((uint64_t)(accum1));
 }
 
-void
-gf_448_strong_reduce (
-    gf_448_t a
-) {
+void gf_strong_reduce (gf a) {
     uint64_t mask = (1ull<<56)-1;
 
     /* first, clear high */
@@ -348,15 +330,11 @@ gf_448_strong_reduce (
     assert(is_zero(carry + scarry));
 }
 
-void
-gf_448_serialize (
-    uint8_t *serial,
-    const gf_448_t x
-) {
+void gf_serialize (uint8_t *serial, const gf x) {
     int i,j;
-    gf_448_t red;
-    gf_448_copy(red, x);
-    gf_448_strong_reduce(red);
+    gf red;
+    gf_copy(red, x);
+    gf_strong_reduce(red);
     for (i=0; i<8; i++) {
         for (j=0; j<7; j++) {
             serial[7*i+j] = red->limb[i];
@@ -366,11 +344,7 @@ gf_448_serialize (
     }
 }
 
-mask_t
-gf_448_deserialize (
-    gf_448_t x,
-    const uint8_t serial[56]
-) {
+mask_t gf_deserialize (gf x, const uint8_t serial[56]) {
     int i,j;
     for (i=0; i<8; i++) {
         word_t out = 0;
diff --git a/src/p448/arch_x86_64/x86-64-arith.h b/src/p448/arch_x86_64/x86-64-arith.h
deleted file mode 100644
index 4f38723..0000000
--- a/src/p448/arch_x86_64/x86-64-arith.h
+++ /dev/null
@@ -1,323 +0,0 @@
-/* Copyright (c) 2014 Cryptography Research, Inc.
- * Released under the MIT License. See LICENSE.txt for license information.
- */
-
-#ifndef __X86_64_ARITH_H__
-#define __X86_64_ARITH_H__
-
-#include <stdint.h>
-
-/* FUTURE: non x86-64 versions of these.
- * FUTURE: autogenerate
- */
-
-static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax;"
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx;"
-             "mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx");
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax;"
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"r"(a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"d"(a));
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul_rr(uint64_t a, uint64_t b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"r"(b), "a"(a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"r"(b), [a]"d"(a));
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax; "
-             "addq %%rax, %%rax; "
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx;"
-             "leaq (,%%rdx,2), %%rdx;"
-             "mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx");
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - uint64_t lo2 = *acc2, hi2 = *acc2>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - "addq %[c], %[lo2]; " - "adcq %[d], %[hi2]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - "addq %%rax, %[lo2]; " - "adcq %%rdx, %[hi2]; " - : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; - *acc2 = (((__uint128_t)(hi2))<<64) | lo2; -} - -static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"d"(a) - : "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"r"(a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void mac_rr(__uint128_t *acc, uint64_t a, const uint64_t b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"r"(b), [a]"d"(a) - : "cc"); - #else - __asm__ volatile - ("mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"r"(b), "a"(a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "addq %%rdx, %%rdx; " - "mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "addq %%rax, %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - 
("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[c], %[lo]; " - "sbbq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "subq %%rax, %[lo]; " - "sbbq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "addq %%rdx, %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[c], %[lo]; " - "sbbq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "addq %%rax, %%rax; " - "mulq %[b]; " - "subq %%rax, %[lo]; " - "sbbq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - *acc = (((__uint128_t)(hi))<<64) | lo; - -} - -static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t c,d, lo = *acc, hi = *acc>>64; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[lo], %[c]; " - "sbbq %[hi], %[d]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - *acc = (((__uint128_t)(d))<<64) | c; -} - -static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) { - return ((__uint128_t)(a)) * b; -} - -static __inline__ __int128_t widemuls(int64_t a, int64_t b) { - return ((__int128_t)(a)) * b; -} - -static __inline__ uint64_t opacify(uint64_t x) { - __asm__ volatile("" : "+r"(x)); - return x; -} - -static __inline__ mask_t is_zero(uint64_t x) { - __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); - return ~x; -} - -#endif /* __X86_64_ARITH_H__ */ diff --git a/src/p480/arch_x86_64/f_impl.c b/src/p480/arch_x86_64/f_impl.c index 7aea1f0..b3c565b 100644 --- a/src/p480/arch_x86_64/f_impl.c +++ b/src/p480/arch_x86_64/f_impl.c @@ -4,12 +4,7 @@ #include "f_field.h" -void -gf_480_mul ( - gf_480_t *__restrict__ cs, - const gf_480_t *as, - const gf_480_t *bs -) { +void gf_mul (gf *__restrict__ cs, const gf *as, const gf *bs) { const uint64_t *a = as->limb, *b = bs->limb; uint64_t *c = cs->limb; @@ -144,12 +139,7 @@ gf_480_mul ( c[0] += ((uint64_t)(accum1)); } -void -gf_480_mulw ( - gf_480_t *__restrict__ cs, - const gf_480_t *as, - uint64_t b -) { +void gf_mulw (gf *__restrict__ cs, const gf *as, uint64_t b) { const uint64_t *a = as->limb; uint64_t *c = cs->limb; @@ -189,11 +179,7 @@ gf_480_mulw ( c[1] += accum4 >> 60; } -void -gf_480_sqr ( - gf_480_t *__restrict__ cs, - const gf_480_t *as -) { +void gf_sqr (gf *__restrict__ cs, const gf *as) { const uint64_t *a = as->limb; uint64_t *c = cs->limb; @@ -304,10 +290,7 @@ gf_480_sqr ( c[0] += ((uint64_t)(accum1)); } -void -gf_480_strong_reduce ( - gf_480_t *a -) { +void gf_strong_reduce (gf *a) { uint64_t mask = (1ull<<60)-1; /* first, clear high */ @@ -347,15 +330,11 @@ gf_480_strong_reduce ( assert(is_zero(carry + scarry)); } -void -gf_480_serialize ( - uint8_t *serial, - const struct gf_480_t *x -) { +void gf_serialize (uint8_t *serial, const struct gf *x) { int i,j,k=0; - gf_480_t red; - gf_480_copy(&red, x); - gf_480_strong_reduce(&red); + gf red; + gf_copy(&red, x); + gf_strong_reduce(&red); word_t r = 0; for (i=0; i<8; i+=2) { r = red.limb[i]; @@ -373,11 
+352,7 @@ gf_480_serialize (
     }
 }
 
-mask_t
-gf_480_deserialize (
-    gf_480_t *x,
-    const uint8_t serial[60]
-) {
+mask_t gf_deserialize (gf *x, const uint8_t serial[60]) {
     int i,j,k=0;
 
     for (i=0; i<8; i+=2) {
diff --git a/src/p480/arch_x86_64/x86-64-arith.h b/src/p480/arch_x86_64/x86-64-arith.h
deleted file mode 100644
index a4d40da..0000000
--- a/src/p480/arch_x86_64/x86-64-arith.h
+++ /dev/null
@@ -1,275 +0,0 @@
-/* Copyright (c) 2014 Cryptography Research, Inc.
- * Released under the MIT License. See LICENSE.txt for license information.
- */
-
-#ifndef __X86_64_ARITH_H__
-#define __X86_64_ARITH_H__
-
-#include <stdint.h>
-
-static __inline__ __uint128_t widemul(const uint64_t *a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax;"
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx;"
-             "mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx");
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul_rm(uint64_t a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax;"
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"r"(a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"d"(a));
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ __uint128_t widemul2(const uint64_t *a, const uint64_t *b) {
-    #ifndef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rax; "
-             "addq %%rax, %%rax; "
-             "mulq %[b];"
-             : [c]"=a"(c), [d]"=d"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "cc");
-        return (((__uint128_t)(d))<<64) | c;
-    #else
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx;"
-             "leaq (,%%rdx,2), %%rdx;"
-             "mulx %[b], %[c], %[d];"
-             : [c]"=r"(c), [d]"=r"(d)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx");
-        return (((__uint128_t)(d))<<64) | c;
-    #endif
-}
-
-static __inline__ void mac(__uint128_t *acc, const uint64_t *a, const uint64_t *b) {
-    uint64_t lo = *acc, hi = *acc>>64;
-
-    #ifdef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx; "
-             "mulx %[b], %[c], %[d]; "
-             "addq %[c], %[lo]; "
-             "adcq %[d], %[hi]; "
-             : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx", "cc");
-    #else
-        __asm__ volatile
-            ("movq %[a], %%rax; "
-             "mulq %[b]; "
-             "addq %%rax, %[lo]; "
-             "adcq %%rdx, %[hi]; "
-             : [lo]"+r"(lo), [hi]"+r"(hi)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rax", "rdx", "cc");
-    #endif
-
-    *acc = (((__uint128_t)(hi))<<64) | lo;
-}
-
-static __inline__ void macac(__uint128_t *acc, __uint128_t *acc2, const uint64_t *a, const uint64_t *b) {
-    uint64_t lo = *acc, hi = *acc>>64;
-    uint64_t lo2 = *acc2, hi2 = *acc2>>64;
-
-    #ifdef __BMI2__
-        uint64_t c,d;
-        __asm__ volatile
-            ("movq %[a], %%rdx; "
-             "mulx %[b], %[c], %[d]; "
-             "addq %[c], %[lo]; "
-             "adcq %[d], %[hi]; "
-             "addq %[c], %[lo2]; "
-             "adcq %[d], %[hi2]; "
-             : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rdx", "cc");
-    #else
-        __asm__ volatile
-            ("movq %[a], %%rax; "
-             "mulq %[b]; "
-             "addq %%rax, %[lo]; "
-             "adcq %%rdx, %[hi]; "
-             "addq %%rax, %[lo2]; "
-             "adcq %%rdx, %[hi2]; "
-             : [lo]"+r"(lo), [hi]"+r"(hi), [lo2]"+r"(lo2), [hi2]"+r"(hi2)
-             : [b]"m"(*b), [a]"m"(*a)
-             : "rax", "rdx", "cc");
-    #endif
-
-    *acc = 
(((__uint128_t)(hi))<<64) | lo; - *acc2 = (((__uint128_t)(hi2))<<64) | lo2; -} - -static __inline__ void mac_rm(__uint128_t *acc, uint64_t a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"d"(a) - : "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"r"(a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void mac2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "addq %%rdx, %%rdx; " - "mulx %[b], %[c], %[d]; " - "addq %[c], %[lo]; " - "adcq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "addq %%rax, %%rax; " - "mulq %[b]; " - "addq %%rax, %[lo]; " - "adcq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void msb(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[c], %[lo]; " - "sbbq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "mulq %[b]; " - "subq %%rax, %[lo]; " - "sbbq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - *acc = (((__uint128_t)(hi))<<64) | lo; -} - -static __inline__ void msb2(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t lo = *acc, hi = *acc>>64; - #ifdef __BMI2__ - uint64_t c,d; - __asm__ volatile - ("movq %[a], %%rdx; " - "addq %%rdx, %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[c], %[lo]; " - "sbbq %[d], %[hi]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - #else - __asm__ volatile - ("movq %[a], %%rax; " - "addq %%rax, %%rax; " - "mulq %[b]; " - "subq %%rax, %[lo]; " - "sbbq %%rdx, %[hi]; " - : [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rax", "rdx", "cc"); - #endif - *acc = (((__uint128_t)(hi))<<64) | lo; - -} - -static __inline__ void mrs(__uint128_t *acc, const uint64_t *a, const uint64_t *b) { - uint64_t c,d, lo = *acc, hi = *acc>>64; - __asm__ volatile - ("movq %[a], %%rdx; " - "mulx %[b], %[c], %[d]; " - "subq %[lo], %[c]; " - "sbbq %[hi], %[d]; " - : [c]"=r"(c), [d]"=r"(d), [lo]"+r"(lo), [hi]"+r"(hi) - : [b]"m"(*b), [a]"m"(*a) - : "rdx", "cc"); - *acc = (((__uint128_t)(d))<<64) | c; -} - -static __inline__ __uint128_t widemulu(uint64_t a, uint64_t b) { - return ((__uint128_t)(a)) * b; -} - -static __inline__ __int128_t widemuls(int64_t a, int64_t b) { - return ((__int128_t)(a)) * b; -} - -static __inline__ uint64_t opacify(uint64_t x) { - __asm__ volatile("" : "+r"(x)); - return x; -} - -static __inline__ mask_t is_zero(uint64_t x) { - __asm__ volatile("neg %0; sbb %0, %0;" : "+r"(x)); - return ~x; -} - -#endif /* __X86_64_ARITH_H__ */ diff --git a/src/p521/arch_ref64/f_impl.c 
b/src/p521/arch_ref64/f_impl.c index 8670cd6..03c98ee 100644 --- a/src/p521/arch_ref64/f_impl.c +++ b/src/p521/arch_ref64/f_impl.c @@ -16,12 +16,7 @@ static __inline__ uint64_t is_zero(uint64_t a) { return (((__uint128_t)a)-1)>>64; } -void -gf_521_mul ( - gf_521_t *__restrict__ cs, - const gf_521_t *as, - const gf_521_t *bs -) { +void gf_mul (gf_s *__restrict__ cs, const gf as, const gf bs) { uint64_t *c = cs->limb; const uint64_t *a = as->limb, *b = bs->limb; __uint128_t accum0, accum1; @@ -157,10 +152,9 @@ gf_521_mul ( c[8] += accum1 >> 58; } -void -gf_521_mulw ( - gf_521_t *__restrict__ cs, - const gf_521_t *as, +void gf_mulw ( + gf_s *__restrict__ cs, + const gf as, uint64_t b ) { const uint64_t *a = as->limb; @@ -196,11 +190,7 @@ gf_521_mulw ( c[1] += accum6 >> 58; } -void -gf_521_sqr ( - gf_521_t *__restrict__ cs, - const gf_521_t *as -) { +void gf_sqr (gf_s *__restrict__ cs, const gf as) { uint64_t *c = cs->limb; const uint64_t *a = as->limb; __uint128_t accum0, accum1; @@ -305,10 +295,7 @@ gf_521_sqr ( c[8] += accum1 >> 58; } -void -gf_521_strong_reduce ( - gf_521_t *a -) { +void gf_strong_reduce (gf a) { uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1; /* first, clear high */ @@ -346,15 +333,11 @@ gf_521_strong_reduce ( assert(is_zero(carry + scarry)); } -void -gf_521_serialize ( - uint8_t *serial, - const struct gf_521_t *x -) { +void gf_serialize (uint8_t *serial, const struct gf x) { int i,k=0; - gf_521_t red; - gf_521_copy(&red, x); - gf_521_strong_reduce(&red); + gf red; + gf_copy(&red, x); + gf_strong_reduce(&red); uint64_t r=0; int bits = 0; @@ -370,11 +353,7 @@ gf_521_serialize ( serial[k++] = r; } -mask_t -gf_521_deserialize ( - gf_521_t *x, - const uint8_t serial[66] -) { +mask_t gf_deserialize (gf x, const uint8_t serial[66]) { int i,k=0,bits=0; __uint128_t out = 0; uint64_t mask = (1ull<<58)-1; diff --git a/src/p521/arch_x86_64_r12/f_impl.c b/src/p521/arch_x86_64_r12/f_impl.c index 0b42a4b..39d0f1e 100644 --- a/src/p521/arch_x86_64_r12/f_impl.c +++ b/src/p521/arch_x86_64_r12/f_impl.c @@ -167,12 +167,7 @@ static inline void hexad_sqr_signed ( -void -gf_521_mul ( - gf_521_t *__restrict__ cs, - const gf_521_t *as, - const gf_521_t *bs -) { +void gf_mul (gf *__restrict__ cs, const gf *as, const gf *bs) { int i; #if 0 @@ -253,13 +248,7 @@ gf_521_mul ( } -void -gf_521_sqr ( - gf_521_t *__restrict__ cs, - const gf_521_t *as -) { - - +void gf_sqr (gf *__restrict__ cs, const gf *as) { int i; #if 0 assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0); @@ -312,15 +301,7 @@ gf_521_sqr ( *(uint64x4_t *)&c[8] = out2; } -void -gf_521_mulw ( - gf_521_t *__restrict__ cs, - const gf_521_t *as, - uint64_t b -) { - - - +void gf_mulw (gf *__restrict__ cs, const gf *as, uint64_t b) { #if 0 int i; assert(as->limb[3] == 0 && as->limb[7] == 0 && as->limb[11] == 0); @@ -374,10 +355,7 @@ gf_521_mulw ( } -void -gf_521_strong_reduce ( - gf_521_t *a -) { +void gf_strong_reduce (gf *a) { uint64_t mask = (1ull<<58)-1, mask2 = (1ull<<57)-1; /* first, clear high */ @@ -417,15 +395,11 @@ gf_521_strong_reduce ( a->limb[3] = a->limb[7] = a->limb[11] = 0; } -void -gf_521_serialize ( - uint8_t *serial, - const struct gf_521_t *x -) { +void gf_serialize (uint8_t *serial, const struct gf *x) { unsigned int i,k=0; - gf_521_t red; - gf_521_copy(&red, x); - gf_521_strong_reduce(&red); + gf red; + gf_copy(&red, x); + gf_strong_reduce(&red); uint64_t r=0; int bits = 0; @@ -441,11 +415,7 @@ gf_521_serialize ( serial[k++] = r; } -mask_t -gf_521_deserialize ( - gf_521_t *x, - const uint8_t 
serial[LIMBPERM(66)] -) { +mask_t gf_deserialize (gf *x, const uint8_t serial[LIMBPERM(66)]) { int i,k=0,bits=0; __uint128_t out = 0; uint64_t mask = (1ull<<58)-1;
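
Every f_impl.c rewritten above leans on the same pattern from the new header: widening 64x64->128 multiplies accumulated into a __uint128_t with mac(), with carries peeled off via shrld(). A minimal sketch of one such step, assuming a radix-2^51 limb layout like p25519's (this helper is illustrative only, not part of the patch; the header is found via the -I src/include/$(ARCH) paths added in the Makefile):

    #include <stdint.h>
    #include "arch_intrinsics.h"  /* widemul, mac, shrld from this patch */

    /* One schoolbook column: c[0] gets the low 51 bits of
     * a[0]*b[0] + a[1]*b[1]; c[1] gets the carry out. */
    static void column_sketch(uint64_t c[2], const uint64_t a[2], const uint64_t b[2]) {
        const uint64_t mask = (1ull<<51)-1;
        __uint128_t accum = widemul(&a[0], &b[0]); /* accum  = a[0]*b[0] */
        mac(&accum, &a[1], &b[1]);                 /* accum += a[1]*b[1] */
        c[0] = (uint64_t)accum & mask;             /* output limb */
        c[1] = shrld(accum, 51);                   /* carry into next column */
    }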