diff --git a/Makefile b/Makefile
index d5145a7..67d5573 100644
--- a/Makefile
+++ b/Makefile
@@ -65,7 +65,7 @@ HEADERS= Makefile $(shell find . -name "*.h") build/timestamp
 
 LIBCOMPONENTS= build/goldilocks.o build/barrett_field.o build/crandom.o \
   build/$(FIELD).o build/ec_point.o build/scalarmul.o build/sha512.o build/magic.o \
-	build/f_arithmetic.o build/arithmetic.o
+	build/f_arithmetic.o build/arithmetic.o build/decaf.o
 
 TESTCOMPONENTS=build/test.o build/test_scalarmul.o build/test_sha512.o \
 	build/test_pointops.o build/test_arithmetic.o build/test_goldilocks.o build/magic.o
diff --git a/include/decaf.h b/include/decaf.h
new file mode 100644
index 0000000..d376123
--- /dev/null
+++ b/include/decaf.h
@@ -0,0 +1,74 @@
+/* Copyright (c) 2015 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+/**
+ * @file decaf.h
+ * @author Mike Hamburg
+ * @brief Decaf high-level functions.
+ */
+#ifndef __DECAF_H__
+#define __DECAF_H__ 1
+
+#include <stdint.h>
+
+typedef uint64_t decaf_word_t, decaf_bool_t; /* limb type and mask-style boolean, both 64-bit */
+#define DECAF_LIMBS (512/8/sizeof(decaf_word_t)) /* 64 bytes of storage per coordinate, counted in words */
+#define DECAF_SER_BYTES 56 /* size of a serialized point, in bytes */
+typedef struct decaf_point_s {
+    decaf_word_t x[DECAF_LIMBS],y[DECAF_LIMBS],z[DECAF_LIMBS],t[DECAF_LIMBS]; /* four coordinates, DECAF_LIMBS words each */
+} __attribute__((aligned(32))) decaf_point_t[1]; /* one-element array type, so a point decays to a pointer when passed */
+
+static const decaf_bool_t DECAF_SUCCESS = -(decaf_bool_t)1, DECAF_FAILURE = 0;
+
+extern const decaf_point_t decaf_identity_point; /* declaration only: without extern every includer would emit its own tentative definition */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+    
+#define API_VIS __attribute__((visibility("default")))
+#define WARN_UNUSED __attribute__((warn_unused_result))
+#define NONNULL2 __attribute__((nonnull(1,2)))
+#define NONNULL3 __attribute__((nonnull(1,2,3)))
+
+void decaf_encode ( /* serialize pt into ser */
+    uint8_t ser[DECAF_SER_BYTES], /* out: encoded bytes */
+    const decaf_point_t pt /* in: point to encode */
+) API_VIS NONNULL2;
+    
+decaf_bool_t decaf_decode ( /* deserialize ser into pt; the result is a bit mask in the DECAF_SUCCESS / DECAF_FAILURE style */
+    decaf_point_t pt, /* out: decoded point */
+    const uint8_t ser[DECAF_SER_BYTES], /* in: encoded bytes */
+    decaf_bool_t allow_identity /* mask: when set, an encoding whose value is zero is not rejected */
+) API_VIS WARN_UNUSED NONNULL2;
+    
+void decaf_add ( /* a = b + c */
+    decaf_point_t a, /* out: sum */
+    const decaf_point_t b, /* in: first operand */
+    const decaf_point_t c /* in: second operand */
+) API_VIS NONNULL3;
+    
+decaf_bool_t decaf_eq ( /* compare two points; the result is a bit mask (all-ones when they compare equal, 0 otherwise) */
+    const decaf_point_t a, /* in: first point */
+    const decaf_point_t b /* in: second point */
+) API_VIS WARN_UNUSED NONNULL2;
+    
+void decaf_sub ( /* a = b - c */
+    decaf_point_t a, /* out: difference */
+    const decaf_point_t b, /* in: minuend */
+    const decaf_point_t c /* in: subtrahend */
+) API_VIS NONNULL3;
+    
+void decaf_add_sub ( /* a = b + c or a = b - c, selected by do_sub */
+    decaf_point_t a, /* out: result */
+    const decaf_point_t b, /* in: first operand */
+    const decaf_point_t c, /* in: second operand */
+    decaf_bool_t do_sub /* used as a bit mask: 0 adds, all-ones subtracts */
+) API_VIS NONNULL3;
+
+#ifdef __cplusplus
+}; /* extern "C" */
+#endif
+
+#endif /* __DECAF_H__ */
diff --git a/src/decaf.c b/src/decaf.c
new file mode 100644
index 0000000..39c0484
--- /dev/null
+++ b/src/decaf.c
@@ -0,0 +1,312 @@
+/* Copyright (c) 2015 Cryptography Research, Inc.
+ * Released under the MIT License.  See LICENSE.txt for license information.
+ */
+
+/**
+ * @file decaf.c
+ * @author Mike Hamburg
+ * @brief Decaf high-level functions.
+ */ 
+
+#include "decaf.h"
+
+typedef uint64_t word_t, mask_t; // TODO
+typedef __uint128_t dword_t; /* unsigned double-width type (compiler extension) used for accumulators */
+typedef __int128_t sdword_t; /* signed double-width type used for borrow/carry chains */
+#define WBITS 64 /* bits in a word_t */
+#define LBITS 56 /* bits kept in each limb after carrying */
+
+#define siv static inline void /* shorthand used for the small helpers below */
+#define NLIMBS 8 /* limbs per field element */
+
+typedef word_t gf[NLIMBS]; /* field element: NLIMBS limbs of LBITS bits, least significant first */
+static const gf ZERO = {0}, ONE = {1}, TWO = {2};
+
+static const word_t LMASK = (1ull<<LBITS)-1; /* low LBITS bits set */
+static const gf P = { LMASK, LMASK, LMASK, LMASK, LMASK-1, LMASK, LMASK, LMASK }; /* the modulus 2^448 - 2^224 - 1, limb by limb */
+#define FOR_LIMB(i,op) { unsigned int i=0; \
+   op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; op;i++; } /* expands op eight times with i = 0..7; i is unsigned */
+
+static const int EDWARDS_D = -39081; /* presumably the Edwards curve parameter d, judging by the name */
+
+siv gf_cpy(gf x, const gf y) { FOR_LIMB(i, x[i] = y[i]); } /* x = y, copied limb by limb */
+
+siv gf_mul_x (gf c, const gf a, const word_t *b, int limbs_b) { /* c = a * b in the field, where b has limbs_b limbs */
+    gf aa;
+    gf_cpy(aa,a); /* working copy, because aa is modified inside the loop */
+    
+    dword_t accum[NLIMBS] = {0}; /* one double-width accumulator per output limb */
+    int i;
+    for (i=0; i<limbs_b; i++) {
+        FOR_LIMB(j,{ accum[(i+j)%NLIMBS] += (__uint128_t)b[i] * aa[j]; }); /* partial products land in column (i+j) mod NLIMBS */
+        aa[(NLIMBS-1-i)^(NLIMBS/2)] += aa[NLIMBS-1-i]; /* limb NLIMBS-1-i wraps around from the next round on: add it to the limb NLIMBS/2 away */
+    }
+    
+    accum[NLIMBS-1] += accum[NLIMBS-2] >> LBITS; /* carry limb NLIMBS-2 into the top limb first */
+    accum[NLIMBS-2] &= LMASK;
+    accum[NLIMBS/2] += accum[NLIMBS-1] >> LBITS; /* top-limb overflow is added at limb NLIMBS/2 here and at limb 0 by the loop below */
+    FOR_LIMB(j,{
+        accum[j] += accum[(j-1)%NLIMBS] >> LBITS;
+        accum[(j-1)%NLIMBS] &= LMASK;
+    }); /* one carry pass; j is unsigned, so (j-1)%NLIMBS is NLIMBS-1 when j == 0 */
+    FOR_LIMB(j, c[j] = accum[j] ); /* c is written only here, after all reads of a and b */
+}
+
+static void gf_mul( gf a, const gf b, const gf c ) { gf_mul_x(a,b,c,NLIMBS); } /* a = b * c, full-width multiplier */
+static void gf_sqr( gf a, const gf b ) { gf_mul_x(a,b,b,NLIMBS); } /* a = b * b, via the general multiplier */
+
+siv gf_sqrn ( gf x, const gf y, int n ) { /* x = y squared n times; plain copy when n <= 0 */
+    int remaining = n; /* squarings still to perform */
+    gf_cpy(x,y);
+    while (remaining > 0) { gf_sqr(x,x); remaining--; }
+}
+
+static void ISR(gf a, const gf x) { /* a = x^((p-3)/4) for p = 2^448 - 2^224 - 1, by a fixed addition chain; callers use it as an inverse square root */
+    gf L0, L1, L2;
+    gf_sqr (L1,    x );
+    gf_mul (L2,    x,   L1 ); /* L2 = x^3 */
+    gf_sqr (L1,   L2 );
+    gf_mul (L2,    x,   L1 ); /* L2 = x^(2^3-1) */
+    gf_sqrn(L1,   L2,    3 );
+    gf_mul (L0,   L2,   L1 ); /* L0 = x^(2^6-1) */
+    gf_sqrn(L1,   L0,    3 );
+    gf_mul (L0,   L2,   L1 ); /* L0 = x^(2^9-1) */
+    gf_sqrn(L2,   L0,    9 );
+    gf_mul (L1,   L0,   L2 ); /* L1 = x^(2^18-1) */
+    gf_sqr (L0,   L1 );
+    gf_mul (L2,     x,  L0  ); /* L2 = x^(2^19-1) */
+    gf_sqrn(L0,   L2,   18  );
+    gf_mul (L2,   L1,   L0  ); /* L2 = x^(2^37-1) */
+    gf_sqrn(L0,   L2,   37  );
+    gf_mul (L1,   L2,   L0  ); /* L1 = x^(2^74-1) */
+    gf_sqrn(L0,   L1,   37  );
+    gf_mul (L1,   L2,   L0  ); /* L1 = x^(2^111-1) */
+    gf_sqrn(L0,   L1,   111 );
+    gf_mul (L2,   L1,   L0  ); /* L2 = x^(2^222-1) */
+    gf_sqr (L0,   L2 );
+    gf_mul (L1,    x,   L0  ); /* L1 = x^(2^223-1) */
+    gf_sqrn(L0,   L1,   223 );
+    gf_mul ( a,   L2,   L0  ); /* a = x^(2^446 - 2^222 - 1) */
+}
+
+const decaf_point_t decaf_identity_point = {{{0},{1},{1},{0}}}; /* (x,y,z,t) = (0,1,1,0) */
+
+siv gf_reduce(gf x) { /* one carry-propagation pass over x, in place */
+    x[NLIMBS/2] += x[NLIMBS-1] >> LBITS; /* top-limb overflow is added at limb NLIMBS/2 here and at limb 0 by the loop below */
+    FOR_LIMB(j,{
+        x[j] += x[(j-1)%NLIMBS] >> LBITS; /* j is unsigned, so (j-1)%NLIMBS is NLIMBS-1 when j == 0 */
+        x[(j-1)%NLIMBS] &= LMASK;
+    });
+}
+
+siv gf_add ( gf x, const gf y, const gf z ) { /* x = y + z, followed by one carry pass */
+    FOR_LIMB(i, x[i] = y[i] + z[i] );
+    gf_reduce(x);
+}
+
+siv gf_sub ( gf x, const gf y, const gf z ) { /* x = y - z, followed by one carry pass */
+    FOR_LIMB(i, x[i] = y[i] - z[i] + 2*P[i] ); /* adds 2*P, a multiple of the modulus, presumably so a limb does not wrap below zero */
+    gf_reduce(x);
+}
+
+siv gf_mlw(gf a, const gf b, int w) { /* a = w * b for a small signed constant w */
+    /* w must be signed: as an unsigned word, a negative constant would be multiplied as a huge positive value */
+    word_t ww = (w>0) ? (word_t)w : (word_t)0 - (word_t)w; /* |w| as a single limb; unsigned negate is safe for INT_MIN */
+    gf_mul_x(a,b,&ww,1);
+    if (w<=0) {
+        /* non-positive multiplier: negate the product (for w == 0 this negates zero) */
+        gf_sub(a,ZERO,a);
+    }
+}
+
+siv cond_neg(gf x, mask_t neg) { /* x = -x where neg is all-ones; x is unchanged where neg is 0 */
+    gf y;
+    gf_sub(y,ZERO,x); /* y = -x, computed unconditionally */
+    FOR_LIMB(i, x[i] = (x[i] & ~neg) | (y[i] & neg) ); /* bitwise select between x and y; no branch on neg */
+}
+
+siv cond_swap(gf x, gf y, mask_t swap) { /* exchange x and y on the bits where swap is set; no branch on swap */
+    FOR_LIMB(k, {
+        word_t diff = (x[k] ^ y[k]) & swap; /* x^y under the mask, 0 elsewhere */
+        x[k] ^= diff;
+        y[k] ^= diff;
+    });
+}
+
+static void gf_canon ( gf a ) { /* reduce a in place: carry pass, subtract P, then add P back if that borrowed */
+    gf_reduce(a);
+
+    /* subtract p with borrow */
+    sdword_t carry = 0;
+    FOR_LIMB(i, {
+        carry = carry + a[i] - P[i];
+        a[i] = carry & LMASK;
+        carry >>= LBITS; /* signed shift: a negative carry becomes the borrow for the next limb */
+    });
+    
+    mask_t addback = carry; /* final borrow truncated to a word: all-ones when carry is -1 */
+    carry = 0;
+
+    /* add it back */
+    FOR_LIMB(i, {
+        carry = carry + a[i] + (P[i] & addback); /* adds P[i] only where addback is set */
+        a[i] = carry & LMASK;
+        carry >>= LBITS;
+    });
+}
+
+static inline word_t gf_eq(const gf a, const gf b) { /* returns all-ones when the canonical form of a - b is zero, else 0 */
+    gf c;
+    gf_sub(c,a,b);
+    gf_canon(c);
+    word_t ret=0;
+    FOR_LIMB(i, ret |= c[i] ); /* OR of all limbs: zero only when every limb is zero */
+    return ((dword_t)ret - 1) >> WBITS; /* ret == 0 wraps to 2^128-1, whose high word is all-ones; any other ret gives high word 0 */
+}
+
+static inline word_t hibit(const gf x) { /* returns all-ones when the canonical form of 2*x is odd, else 0 */
+    gf y;
+    gf_add(y,x,x);
+    gf_canon(y);
+    return -(y[0]&1); /* 0 stays 0, 1 becomes all-ones */
+}
+
+// FIXME: 32-bit cleanliness
+siv gf_ser ( uint8_t serial[56], const gf x ) { /* write the canonical form of x as 56 bytes */
+    int i,j;
+    gf red;
+    gf_cpy(red,x);
+    gf_canon(red); /* canonicalize a copy; x itself is not modified */
+    for (i=0; i<8; i++) {
+        for (j=0; j<7; j++) { /* 7 bytes per limb, least significant byte first */
+            serial[7*i+j] = red[i]; /* conversion to uint8_t keeps the low byte */
+            red[i] >>= 8;
+        }
+    }
+}
+
+// FIXME: 32-bit cleanliness
+static mask_t gf_deser ( gf x, const uint8_t serial[56] ) { /* unpack 56 bytes into x; returns all-ones iff the value is below P */
+    int i,j;
+    for (i=0; i<8; i++) {
+        uint64_t out = 0;
+        for (j=0; j<7; j++) { /* 7 bytes per limb, least significant byte first */
+            out |= ((uint64_t)serial[7*i+j])<<(8*j);
+        }
+        x[i] = out;
+    }
+    
+    sdword_t accum = 0; /* running borrow of x - P: 0 or -1 after each limb */
+    FOR_LIMB(i, accum = (accum + x[i] - P[i]) >> WBITS ); /* the final borrow is -1 exactly when x < P */
+    return accum; /* x == P, a second encoding of zero, is rejected along with x > P */
+}
+
+siv
+add_sub_point ( /* c = d + e when sub is 0, c = d - e when sub is all-ones */
+    decaf_point_t c,
+    const decaf_point_t d,
+    const decaf_point_t e,
+    mask_t sub
+) {
+    gf L0, L1, L2, L3;
+    gf_sub ( L1, d->y, d->x );
+    gf_sub ( L2, e->y, e->x );
+    gf_add ( L3, e->y, e->x );
+    cond_swap(L2,L3,sub); /* exchanging (y-x, y+x) of e is what negating e->x would produce */
+    gf_mul ( L0, L2, L1 );
+    gf_add ( L1, d->y, d->x );
+    gf_mul ( c->y, L3, L1 ); /* c->y is used as scratch from here on */
+    gf_mul ( L1, e->t, d->t );
+    gf_mlw ( c->x, L1, 2-2*EDWARDS_D ); /* c->x = (2-2d) * d->t * e->t, also scratch */
+    gf_add ( L1, L0, c->y );
+    gf_sub ( L2, c->y, L0 );
+    gf_mul ( L0, d->z, e->z );
+    gf_add ( L0, L0, L0 ); /* L0 = 2 * d->z * e->z */
+    gf_add ( c->y, L0, c->x );
+    gf_sub ( L0, L0, c->x );
+    cond_swap(L0,c->y,sub); /* second swap, under the same mask as the first */
+    gf_mul ( c->z, L0, c->y );
+    gf_mul ( c->x, c->y, L2 ); /* reads c->y before the next line overwrites it */
+    gf_mul ( c->y, L0, L1 );
+    gf_mul ( c->t, L1, L2 );
+}
+    
+void decaf_encode( uint8_t ser[DECAF_SER_BYTES], const decaf_point_t a ) { /* compute one field element from a and serialize it into ser */
+    gf L0, L1, L2, L3;
+    gf_mlw ( L0, a->y, 1-EDWARDS_D ); 
+    gf_mul ( L2, L0, a->t ); 
+    gf_mul ( L0, a->x, a->z ); 
+    gf_sub ( L3, L2, L0 ); /* L3 = (1-d)*y*t - x*z */
+    gf_add ( L0, a->z, a->y ); 
+    gf_sub ( L1, a->z, a->y ); 
+    gf_mul ( L2, L1, L0 ); /* L2 = (z-y)*(z+y) */
+    gf_mlw ( L1, L2, -EDWARDS_D );
+    ISR ( L0, L1 ); /* L0 = ISR of -d*(z-y)*(z+y) */
+    gf_mlw ( L1, L0, -EDWARDS_D ); 
+    gf_mul ( L2, L1, L0 );
+    gf_mul ( L0, L2, L3 );
+    gf_add ( L3, L1, L1 );  
+    gf_mul ( L2, L3, a->z );   
+    cond_neg ( L1, ~hibit(L2) ); /* negate L1 unless hibit(L2) is set */
+    gf_mul ( L2, L1, a->y ); 
+    gf_add ( L0, L0, L2 );
+    cond_neg ( L0, hibit(L0) ); /* negate the result when its hibit is set */
+    gf_ser(ser,L0); /* gf_ser canonicalizes before writing the bytes */
+}
+    
+decaf_bool_t decaf_decode ( /* deserialize ser into a; returns a mask: all-ones on success, 0 on failure */
+    decaf_point_t a,
+    const uint8_t ser[DECAF_SER_BYTES],
+    decaf_bool_t allow_identity
+) {
+    gf s, L0, L1, L2, L3, L4;
+    mask_t succ = gf_deser( s, ser ); /* fills s; this has to run before s is read anywhere */
+    mask_t zero = gf_eq(s, ZERO); /* all-ones when the decoded value is zero */
+    succ &= allow_identity | ~zero; /* a zero encoding passes only if the caller allows it */
+    succ &= ~hibit(s); /* reject values whose hibit is set */
+    gf_sqr ( L0, s );
+    gf_sub ( a->z, ONE, L0 ); /* z = 1 - s^2 */
+    gf_sqr ( L1, a->z ); 
+    gf_mlw ( L2, L0, 4-4*EDWARDS_D );
+    gf_add ( L2, L2, L1 ); /* L2 = (4-4d)*s^2 + z^2 */
+    gf_mul ( L1, L2, L0 );
+    ISR ( L3, L1 );
+    gf_sqr ( L4, L3 );
+    gf_mul ( L0, L4, L1 );
+    gf_add ( L0, L0, ONE );
+    succ &= ~gf_eq ( L0, ZERO ); /* fail when L3^2 * L1 == -1 */
+    gf_mul ( L1, L2, L3 );
+    cond_neg ( L3, hibit(L1) ); /* negate L3 when hibit(L2*L3) is set */
+    gf_add ( a->x, s, s ); /* x = 2*s */
+    gf_mul ( L2, L3, s );
+    gf_sub ( L1, TWO, a->z );
+    gf_mul ( L0, L1, L2 );
+    gf_mul ( a->y,L0,a->z );
+    gf_mul ( a->t,a->x,L0 );
+    a->y[0] -= zero; /* zero is 0 or all-ones, so this adds 1 to y's low limb when s == 0 */
+    return succ;
+}
+    
+void decaf_add(decaf_point_t a, const decaf_point_t b, const decaf_point_t c) { /* a = b + c */
+    add_sub_point(a,b,c,(mask_t)0); /* zero mask selects addition */
+}
+    
+void decaf_sub(decaf_point_t a, const decaf_point_t b, const decaf_point_t c) { /* a = b - c */
+    add_sub_point(a,b,c,~(mask_t)0); /* all-ones mask selects subtraction */
+}
+    
+void decaf_add_sub ( /* a = b + c or a = b - c, selected by the do_sub mask */
+    decaf_point_t a,
+    const decaf_point_t b,
+    const decaf_point_t c,
+    decaf_bool_t do_sub
+) {
+    add_sub_point(a,b,c,(mask_t)do_sub); /* do_sub is forwarded unchanged as the mask */
+}
+
+decaf_bool_t decaf_eq ( const decaf_point_t a, const decaf_point_t b ) { /* mask result: all-ones when a->x*b->y == a->y*b->x */
+    gf lhs, rhs; /* the two cross products being compared */
+    gf_mul ( rhs, a->y, b->x );
+    gf_mul ( lhs, b->y, a->x );
+    return gf_eq(lhs,rhs);
+}
diff --git a/src/ec_point.c b/src/ec_point.c
index f9de136..9905cda 100644
--- a/src/ec_point.c
+++ b/src/ec_point.c
@@ -68,7 +68,8 @@ add_tw_extended (
 
 void
 add_sub_tw_extended (
-    tw_extended_a_t  d,
+    tw_extended_a_t c,
+    const tw_extended_a_t  d,
     const tw_extended_a_t e,
     mask_t sub
 ) {
@@ -79,20 +80,20 @@ add_sub_tw_extended (
     constant_time_cond_swap(L2,L3,sizeof(L2),sub);
     field_mul ( L0, L2, L1 );
     field_add ( L1, d->y, d->x );
-    field_mul ( d->y, L3, L1 );
+    field_mul ( c->y, L3, L1 );
     field_mul ( L1, e->t, d->t );
-    field_mulw_scc_wr ( d->x, L1, 2-2*EDWARDS_D );
+    field_mulw_scc_wr ( c->x, L1, 2-2*EDWARDS_D );
-    field_add ( L1, L0, d->y );
+    field_add ( L1, L0, c->y ); /* the product was stored in c->y above; d->y still holds the input when c != d */
-    field_sub ( L2, d->y, L0 );
+    field_sub ( L2, c->y, L0 );
     field_mul ( L0, d->z, e->z );
     field_add ( L0, L0, L0 );
-    field_add ( d->y, L0, d->x );
-    field_sub ( L0, L0, d->x );
-    constant_time_cond_swap(L0,d->y,sizeof(L0),sub);
-    field_mul ( d->z, L0, d->y );
-    field_mul ( d->x, d->y, L2 );
-    field_mul ( d->y, L0, L1 );
-    field_mul ( d->t, L1, L2 );
+    field_add ( c->y, L0, c->x );
+    field_sub ( L0, L0, c->x );
+    constant_time_cond_swap(L0,c->y,sizeof(L0),sub);
+    field_mul ( c->z, L0, c->y );
+    field_mul ( c->x, c->y, L2 );
+    field_mul ( c->y, L0, L1 );
+    field_mul ( c->t, L1, L2 );
 }
 
 void
diff --git a/src/include/ec_point.h b/src/include/ec_point.h
index db5ee7d..9ad22f7 100644
--- a/src/include/ec_point.h
+++ b/src/include/ec_point.h
@@ -307,7 +307,8 @@ add_tw_extended (
 
 void
 add_sub_tw_extended (
-    tw_extended_a_t  d,
+    tw_extended_a_t  c,
+    const tw_extended_a_t  d,
     const tw_extended_a_t e,
     mask_t sub
 );
diff --git a/src/scalarmul.c b/src/scalarmul.c
index f3ffd99..8bd8dd0 100644
--- a/src/scalarmul.c
+++ b/src/scalarmul.c
@@ -255,7 +255,7 @@ scalarmul_ed (
         bits ^= inv;
     
         constant_time_lookup_tw_extended(tmp, (const tw_extended_a_t*)multiples, NTABLE, bits & WINDOW_T_MASK);
-        add_sub_tw_extended(working, tmp, inv);
+        add_sub_tw_extended(working, working, tmp, inv);
     }
 }
 
diff --git a/test/test_pointops.c b/test/test_pointops.c
index 32f94de..12aaffd 100644
--- a/test/test_pointops.c
+++ b/test/test_pointops.c
@@ -3,6 +3,7 @@
 #include <stdio.h>
 
 #include "ec_point.h"
+#include "decaf.h"
 #include "scalarmul.h"
 #include "magic.h"
 #include "field.h"
@@ -156,12 +157,14 @@ add_double_test (
     copy_tw_extensible(&textb, &text1);
     add_tw_pniels_to_tw_extensible(&textb, &pn);
 
+    decaf_point_t ted3;
     convert_tw_extensible_to_tw_extended(&ted1, &text1);
     convert_tw_extensible_to_tw_extended(&ted2, &text2);
+    decaf_add(ted3, (struct decaf_point_s*)&ted1, (struct decaf_point_s*)&ted2);
     add_tw_extended(&ted1, &ted2);
     convert_tw_extensible_to_tw_extended(&ted2, &textb);
     
-    if (~decaf_eq_tw_extended(&ted1, &ted2)) {
+    if (~decaf_eq_tw_extended(&ted1, &ted2) | ~decaf_eq((struct decaf_point_s*)&ted1, ted3)) {
         youfail();
         succ = 0;
         printf("    Tw extended simple compat:\n");
@@ -173,6 +176,12 @@ add_double_test (
         field_print("    y2",ted2.y);
         field_print("    z2",ted2.z);
         field_print("    t2",ted2.t);
+        struct tw_extended_t *t3 = (struct tw_extended_t *)&ted3;
+        field_print("    x3",t3->x);
+        field_print("    y3",t3->y);
+        field_print("    z3",t3->z);
+        field_print("    t3",t3->t);
+        
     }
     
     succ &= fail_if_different_tw(&texta,&textb,"Addition commutativity","a+b","b+a");