diff options
-rw-r--r-- | .gitignore | 12 | ||||
-rw-r--r-- | LICENSE.md | 46 | ||||
-rw-r--r-- | Makefile | 56 | ||||
-rw-r--r-- | README | 40 | ||||
-rw-r--r-- | contrib/Curve25519Donna.c | 118 | ||||
-rw-r--r-- | contrib/Curve25519Donna.h | 53 | ||||
-rw-r--r-- | contrib/Curve25519Donna.java | 77 | ||||
-rw-r--r-- | contrib/make-snippets | 68 | ||||
-rw-r--r-- | curve25519-donna-c64.c | 449 | ||||
-rw-r--r-- | curve25519-donna.c | 860 | ||||
-rw-r--r-- | curve25519-donna.podspec | 13 | ||||
-rw-r--r-- | python-src/curve25519/__init__.py | 4 | ||||
-rw-r--r-- | python-src/curve25519/curve25519module.c | 105 | ||||
-rw-r--r-- | python-src/curve25519/keys.py | 46 | ||||
-rw-r--r-- | python-src/curve25519/test/__init__.py | 0 | ||||
-rwxr-xr-x | python-src/curve25519/test/test_curve25519.py | 99 | ||||
-rwxr-xr-x | python-src/curve25519/test/test_speed.py | 46 | ||||
-rwxr-xr-x | setup.py | 38 | ||||
-rw-r--r-- | speed-curve25519.c | 50 | ||||
-rw-r--r-- | test-curve25519.c | 54 | ||||
-rw-r--r-- | test-noncanon.c | 39 | ||||
-rw-r--r-- | test-sc-curve25519.c | 72 | ||||
-rw-r--r-- | test-sc-curve25519.s | 8 |
23 files changed, 2353 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ccabede --- /dev/null +++ b/.gitignore @@ -0,0 +1,12 @@ +/curve25519-donna-c64.a +/curve25519-donna.a +/test-curve25519-donna +/speed-curve25519-donna +/test-curve25519-donna-c64 +/speed-curve25519-donna-c64 +/test-sc-curve25519-donna-c64 +/build +*.o +*.pyc +/dist +/MANIFEST diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000..33a3240 --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,46 @@ +Copyright 2008, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. +* Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +curve25519-donna: Curve25519 elliptic curve, public key function + +http://code.google.com/p/curve25519-donna/ + +Adam Langley <agl@imperialviolet.org> + +Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to> + +More information about curve25519 can be found here + http://cr.yp.to/ecdh.html + +djb's sample implementation of curve25519 is written in a special assembly +language called qhasm and uses the floating point registers. + +This is, almost, a clean room reimplementation from the curve25519 paper. It +uses many of the tricks described therein. Only the crecip function is taken +from the sample implementation. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..e31fcca --- /dev/null +++ b/Makefile @@ -0,0 +1,56 @@ +CFLAGS=-Wmissing-prototypes -Wdeclaration-after-statement -O2 -Wall +CFLAGS_32=-m32 + +targets: curve25519-donna.a curve25519-donna-c64.a + +test: test-donna test-donna-c64 + +clean: + rm -f *.o *.a *.pp test-curve25519-donna test-curve25519-donna-c64 speed-curve25519-donna speed-curve25519-donna-c64 test-noncanon-curve25519-donna test-noncanon-curve25519-donna-c64 + +curve25519-donna.a: curve25519-donna.o + ar -rc curve25519-donna.a curve25519-donna.o + ranlib curve25519-donna.a + +curve25519-donna.o: curve25519-donna.c + gcc -c curve25519-donna.c $(CFLAGS) $(CFLAGS_32) + +curve25519-donna-c64.a: curve25519-donna-c64.o + ar -rc curve25519-donna-c64.a curve25519-donna-c64.o + ranlib curve25519-donna-c64.a + +curve25519-donna-c64.o: curve25519-donna-c64.c + gcc -c curve25519-donna-c64.c $(CFLAGS) + +test-donna: test-curve25519-donna + ./test-curve25519-donna | head -123456 | tail -1 + +test-donna-c64: test-curve25519-donna-c64 + ./test-curve25519-donna-c64 | head -123456 | tail -1 + +test-curve25519-donna: test-curve25519.c curve25519-donna.a + gcc -o test-curve25519-donna test-curve25519.c curve25519-donna.a $(CFLAGS) $(CFLAGS_32) + +test-curve25519-donna-c64: test-curve25519.c curve25519-donna-c64.a + gcc -o 
test-curve25519-donna-c64 test-curve25519.c curve25519-donna-c64.a $(CFLAGS) + +speed-curve25519-donna: speed-curve25519.c curve25519-donna.a + gcc -o speed-curve25519-donna speed-curve25519.c curve25519-donna.a $(CFLAGS) $(CFLAGS_32) + +speed-curve25519-donna-c64: speed-curve25519.c curve25519-donna-c64.a + gcc -o speed-curve25519-donna-c64 speed-curve25519.c curve25519-donna-c64.a $(CFLAGS) + +test-sc-curve25519-donna-c64: test-sc-curve25519.c curve25519-donna-c64.a + gcc -o test-sc-curve25519-donna-c64 -O test-sc-curve25519.c curve25519-donna-c64.a test-sc-curve25519.s $(CFLAGS) + +test-noncanon-donna: test-noncanon-curve25519-donna + ./test-noncanon-curve25519-donna + +test-noncanon-donna-c64: test-noncanon-curve25519-donna-c64 + ./test-noncanon-curve25519-donna-c64 + +test-noncanon-curve25519-donna: test-noncanon.c curve25519-donna.a + gcc -o test-noncanon-curve25519-donna test-noncanon.c curve25519-donna.a $(CFLAGS) $(CFLAGS_32) + +test-noncanon-curve25519-donna-c64: test-noncanon.c curve25519-donna-c64.a + gcc -o test-noncanon-curve25519-donna-c64 test-noncanon.c curve25519-donna-c64.a $(CFLAGS) @@ -0,0 +1,40 @@ +See http://code.google.com/p/curve25519-donna/ for details. + +BUILDING: + +If you run `make`, two .a archives will be built, similar to djb's curve25519 +code. Alternatively, read on: + +The C implementation is contained within curve25519-donna.c. It has no external +dependencies and is BSD licensed. You can copy/include/link it directly in with +your program. Recommended C flags: -O2 + +The x86-64 bit implementation is contained within curve25519-donna-x86-64.c and +curve25519-donna-x86-64.s. 
Build like this: + +% cpp curve25519-donna-x86-64.s > curve25519-donna-x86-64.s.pp +% as -o curve25519-donna-x86-64.s.o curve25519-donna-x86-64.s.pp +% gcc -O2 -c curve25519-donna-x86-64.c + +Then the two .o files can be linked in + +USAGE: + +The usage is exactly the same as djb's code (as described at +http://cr.yp.to/ecdh.html) except that the function is called curve25519_donna. + +In short, + +To generate a private key just generate 32 random bytes. + +To generate the public key, just do: + + static const uint8_t basepoint[32] = {9}; + curve25519_donna(mypublic, mysecret, basepoint); + +To generate an agreed key do: + + uint8_t shared_key[32]; + curve25519_donna(shared_key, mysecret, theirpublic); + +And hash the shared_key with a cryptographic hash function before using. diff --git a/contrib/Curve25519Donna.c b/contrib/Curve25519Donna.c new file mode 100644 index 0000000..71b816c --- /dev/null +++ b/contrib/Curve25519Donna.c @@ -0,0 +1,118 @@ +/* + James Robson + Public domain. +*/ + +#include "Curve25519Donna.h" +#include <stdio.h> +#include <stdlib.h> + +extern void curve25519_donna(unsigned char *output, const unsigned char *a, + const unsigned char *b); + +unsigned char* +as_unsigned_char_array(JNIEnv* env, jbyteArray array, int* len); + +jbyteArray as_byte_array(JNIEnv* env, unsigned char* buf, int len); + + +jbyteArray as_byte_array(JNIEnv* env, unsigned char* buf, int len) { + jbyteArray array = (*env)->NewByteArray(env, len); + (*env)->SetByteArrayRegion(env, array, 0, len, (jbyte*)buf); + + //int i; + //for (i = 0;i < len;++i) printf("%02x",(unsigned int) buf[i]); printf(" "); + //printf("\n"); + + return array; +} + +unsigned char* +as_unsigned_char_array(JNIEnv* env, jbyteArray array, int* len) { + + *len = (*env)->GetArrayLength(env, array); + unsigned char* buf = (unsigned char*)calloc(*len+1, sizeof(char)); + (*env)->GetByteArrayRegion (env, array, 0, *len, (jbyte*)buf); + return buf; + +} + +JNIEXPORT jbyteArray JNICALL 
Java_Curve25519Donna_curve25519Donna + (JNIEnv *env, jobject obj, jbyteArray a, jbyteArray b) { + + unsigned char o[32] = {0}; + int l1, l2; + unsigned char* a1 = as_unsigned_char_array(env, a, &l1); + unsigned char* b1 = as_unsigned_char_array(env, b, &l2); + + if ( !(l1 == 32 && l2 == 32) ) { + fprintf(stderr, "Error, must be length 32"); + return NULL; + } + + + curve25519_donna(o, (const unsigned char*)a1, (const unsigned char*)b1); + + free(a1); + free(b1); + + return as_byte_array(env, (unsigned char*)o, 32); +} + +JNIEXPORT jbyteArray JNICALL Java_Curve25519Donna_makePrivate + (JNIEnv *env, jobject obj, jbyteArray secret) { + + int len; + unsigned char* k = as_unsigned_char_array(env, secret, &len); + + if (len != 32) { + fprintf(stderr, "Error, must be length 32"); + return NULL; + } + + k[0] &= 248; + k[31] &= 127; + k[31] |= 64; + return as_byte_array(env, k, 32); +} + +JNIEXPORT jbyteArray JNICALL Java_Curve25519Donna_getPublic + (JNIEnv *env, jobject obj, jbyteArray privkey) { + + int len; + unsigned char* private = as_unsigned_char_array(env, privkey, &len); + + if (len != 32) { + fprintf(stderr, "Error, must be length 32"); + return NULL; + } + + unsigned char pubkey[32]; + unsigned char basepoint[32] = {9}; + + curve25519_donna(pubkey, private, basepoint); + return as_byte_array(env, (unsigned char*)pubkey, 32); +} + +JNIEXPORT jbyteArray JNICALL Java_Curve25519Donna_makeSharedSecret + (JNIEnv *env, jobject obj, jbyteArray privkey, jbyteArray their_pubkey) { + + unsigned char shared_secret[32]; + + int l1, l2; + unsigned char* private = as_unsigned_char_array(env, privkey, &l1); + unsigned char* pubkey = as_unsigned_char_array(env, their_pubkey, &l2); + + if ( !(l1 == 32 && l2 == 32) ) { + fprintf(stderr, "Error, must be length 32"); + return NULL; + } + + curve25519_donna(shared_secret, private, pubkey); + return as_byte_array(env, (unsigned char*)shared_secret, 32); +} + +JNIEXPORT void JNICALL Java_Curve25519Donna_helowrld + (JNIEnv *env, jobject 
obj) { + printf("helowrld\n"); +} diff --git a/contrib/Curve25519Donna.h b/contrib/Curve25519Donna.h new file mode 100644 index 0000000..3cd4ca0 --- /dev/null +++ b/contrib/Curve25519Donna.h @@ -0,0 +1,53 @@ +/* DO NOT EDIT THIS FILE - it is machine generated */ +#include <jni.h> +/* Header for class Curve25519Donna */ + +#ifndef _Included_Curve25519Donna +#define _Included_Curve25519Donna +#ifdef __cplusplus +extern "C" { +#endif +/* + * Class: Curve25519Donna + * Method: curve25519Donna + * Signature: ([B[B)[B + */ +JNIEXPORT jbyteArray JNICALL Java_Curve25519Donna_curve25519Donna + (JNIEnv *, jobject, jbyteArray, jbyteArray); + +/* + * Class: Curve25519Donna + * Method: makePrivate + * Signature: ([B)[B + */ +JNIEXPORT jbyteArray JNICALL Java_Curve25519Donna_makePrivate + (JNIEnv *, jobject, jbyteArray); + +/* + * Class: Curve25519Donna + * Method: getPublic + * Signature: ([B)[B + */ +JNIEXPORT jbyteArray JNICALL Java_Curve25519Donna_getPublic + (JNIEnv *, jobject, jbyteArray); + +/* + * Class: Curve25519Donna + * Method: makeSharedSecret + * Signature: ([B[B)[B + */ +JNIEXPORT jbyteArray JNICALL Java_Curve25519Donna_makeSharedSecret + (JNIEnv *, jobject, jbyteArray, jbyteArray); + +/* + * Class: Curve25519Donna + * Method: helowrld + * Signature: ()V + */ +JNIEXPORT void JNICALL Java_Curve25519Donna_helowrld + (JNIEnv *, jobject); + +#ifdef __cplusplus +} +#endif +#endif diff --git a/contrib/Curve25519Donna.java b/contrib/Curve25519Donna.java new file mode 100644 index 0000000..e28cb53 --- /dev/null +++ b/contrib/Curve25519Donna.java @@ -0,0 +1,77 @@ +/* + James Robson + Public domain. 
+*/ + +public class Curve25519Donna { + + final protected static char[] hexArray = "0123456789ABCDEF".toCharArray(); + + public static String bytesToHex(byte[] bytes) { + char[] hexChars = new char[bytes.length * 2]; + int v; + for ( int j = 0; j < bytes.length; j++ ) { + v = bytes[j] & 0xFF; + hexChars[j * 2] = hexArray[v >>> 4]; + hexChars[j * 2 + 1] = hexArray[v & 0x0F]; + } + return new String(hexChars); + } + + public native byte[] curve25519Donna(byte[] a, byte[] b); + public native byte[] makePrivate(byte[] secret); + public native byte[] getPublic(byte[] privkey); + public native byte[] makeSharedSecret(byte[] privkey, byte[] theirPubKey); + public native void helowrld(); + + // Uncomment if your Java is 32-bit: + //static { System.loadLibrary("Curve25519Donna"); } + + // Otherwise, load this 64-bit .jnilib: + static { System.loadLibrary("Curve25519Donna_64"); } + + /* + To give the old tires a kick (OSX): + java -cp `pwd` Curve25519Donna + */ + public static void main (String[] args) { + + Curve25519Donna c = new Curve25519Donna(); + + // These should be 32 bytes long + byte[] user1Secret = "abcdefghijklmnopqrstuvwxyz123456".getBytes(); + byte[] user2Secret = "654321zyxwvutsrqponmlkjihgfedcba".getBytes(); + + + // You can use the curve function directly... + + //byte[] o = c.curve25519Donna(a, b); + //System.out.println("o = " + bytesToHex(o)); + + + // ... but it's not really necessary. 
Just use the following + // convenience methods: + + byte[] privKey = c.makePrivate(user1Secret); + byte[] pubKey = c.getPublic(privKey); + + byte[] privKey2 = c.makePrivate(user2Secret); + byte[] pubKey2 = c.getPublic(privKey2); + + System.out.println("'user1' privKey = " + bytesToHex(privKey)); + System.out.println("'user1' pubKey = " + bytesToHex(pubKey)); + System.out.println("==================================================="); + + System.out.println("'user2' privKey = " + bytesToHex(privKey2)); + System.out.println("'user2' pubKey = " + bytesToHex(pubKey2)); + System.out.println("==================================================="); + + + byte[] ss1 = c.makeSharedSecret(privKey, pubKey2); + System.out.println("'user1' computes shared secret: " + bytesToHex(ss1)); + + byte[] ss2 = c.makeSharedSecret(privKey2, pubKey); + System.out.println("'user2' computes shared secret: " + bytesToHex(ss2)); + + } +} diff --git a/contrib/make-snippets b/contrib/make-snippets new file mode 100644 index 0000000..4568721 --- /dev/null +++ b/contrib/make-snippets @@ -0,0 +1,68 @@ +CFLAGS=-Wmissing-prototypes -Wdeclaration-after-statement -O2 -Wall +CC=clang + + +targets: curve25519-donna.a curve25519-donna-c64.a + +test: test-donna test-donna-c64 + + +clean: + rm -f java-src/*.class java-src/*.jnilib *.dylib *.o *.a *.pp test-curve25519-donna test-curve25519-donna-c64 speed-curve25519-donna speed-curve25519-donna-c64 + +curve25519-donna.a: curve25519-donna.o + ar -rc curve25519-donna.a curve25519-donna.o + ranlib curve25519-donna.a + + +##### OSX dynamic library (32- & 64-bit) + +curve25519donna.dylib: curve25519-donna.a curve25519-donna-c64.a + $(CC) -m32 -fpic -shared -Wl,-all_load curve25519-donna.a -Wl,-all_load -o libcurve25519donna.dylib + $(CC) -fpic -shared -Wl,-all_load curve25519-donna-c64.a -Wl,-all_load -o libcurve25519donna_64.dylib + +##### OSX/Java section hence + +# Java JNI - compiled for OSX (32- & 64-bit) +Curve25519Donna.class: + cd java-src; javah -jni 
Curve25519Donna; cd .. + cd java-src; javac Curve25519Donna.java; cd .. + +Curve25519Donna.jnilib: curve25519-donna.a curve25519-donna-c64.a Curve25519Donna.class + @echo "Building 32-bit..." + clang -o java-src/libCurve25519Donna.jnilib $(CFLAGS) -lc -shared -m32 -I /System/Library/Frameworks/JavaVM.framework/Headers curve25519-donna.o java-src/Curve25519Donna.c + @echo "Building 64-bit..." + clang -o java-src/libCurve25519Donna_64.jnilib $(CFLAGS) -lc -shared -I /System/Library/Frameworks/JavaVM.framework/Headers curve25519-donna-c64.o java-src/Curve25519Donna.c + +##### OSX/Java section end + +curve25519-donna.o: curve25519-donna.c + $(CC) -c curve25519-donna.c $(CFLAGS) -m32 + +curve25519-donna-c64.a: curve25519-donna-c64.o + ar -rc curve25519-donna-c64.a curve25519-donna-c64.o + ranlib curve25519-donna-c64.a + +curve25519-donna-c64.o: curve25519-donna-c64.c + $(CC) -c curve25519-donna-c64.c $(CFLAGS) + +test-donna: test-curve25519-donna + ./test-curve25519-donna | head -123456 | tail -1 + +test-donna-c64: test-curve25519-donna-c64 + ./test-curve25519-donna-c64 | head -123456 | tail -1 + +test-curve25519-donna: test-curve25519.c curve25519-donna.a + $(CC) -o test-curve25519-donna test-curve25519.c curve25519-donna.a $(CFLAGS) -m32 + +test-curve25519-donna-c64: test-curve25519.c curve25519-donna-c64.a + $(CC) -o test-curve25519-donna-c64 test-curve25519.c curve25519-donna-c64.a $(CFLAGS) + +speed-curve25519-donna: speed-curve25519.c curve25519-donna.a + $(CC) -o speed-curve25519-donna speed-curve25519.c curve25519-donna.a $(CFLAGS) -m32 + +speed-curve25519-donna-c64: speed-curve25519.c curve25519-donna-c64.a + $(CC) -o speed-curve25519-donna-c64 speed-curve25519.c curve25519-donna-c64.a $(CFLAGS) + +test-sc-curve25519-donna-c64: test-sc-curve25519.c curve25519-donna-c64.a + $(CC) -o test-sc-curve25519-donna-c64 -O test-sc-curve25519.c curve25519-donna-c64.a test-sc-curve25519.s $(CFLAGS) diff --git a/curve25519-donna-c64.c b/curve25519-donna-c64.c new file mode 
100644 index 0000000..9ebd8a1 --- /dev/null +++ b/curve25519-donna-c64.c @@ -0,0 +1,449 @@ +/* Copyright 2008, Google Inc. + * All rights reserved. + * + * Code released into the public domain. + * + * curve25519-donna: Curve25519 elliptic curve, public key function + * + * http://code.google.com/p/curve25519-donna/ + * + * Adam Langley <agl@imperialviolet.org> + * + * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to> + * + * More information about curve25519 can be found here + * http://cr.yp.to/ecdh.html + * + * djb's sample implementation of curve25519 is written in a special assembly + * language called qhasm and uses the floating point registers. + * + * This is, almost, a clean room reimplementation from the curve25519 paper. It + * uses many of the tricks described therein. Only the crecip function is taken + * from the sample implementation. + */ + +#include <string.h> +#include <stdint.h> + +typedef uint8_t u8; +typedef uint64_t limb; +typedef limb felem[5]; +// This is a special gcc mode for 128-bit integers. It's implemented on 64-bit +// platforms only as far as I know. +typedef unsigned uint128_t __attribute__((mode(TI))); + +#undef force_inline +#define force_inline __attribute__((always_inline)) + +/* Sum two numbers: output += in */ +static inline void force_inline +fsum(limb *output, const limb *in) { + output[0] += in[0]; + output[1] += in[1]; + output[2] += in[2]; + output[3] += in[3]; + output[4] += in[4]; +} + +/* Find the difference of two numbers: output = in - output + * (note the order of the arguments!) 
+ * + * Assumes that out[i] < 2**52 + * On return, out[i] < 2**55 + */ +static inline void force_inline +fdifference_backwards(felem out, const felem in) { + /* 152 is 19 << 3 */ + static const limb two54m152 = (((limb)1) << 54) - 152; + static const limb two54m8 = (((limb)1) << 54) - 8; + + out[0] = in[0] + two54m152 - out[0]; + out[1] = in[1] + two54m8 - out[1]; + out[2] = in[2] + two54m8 - out[2]; + out[3] = in[3] + two54m8 - out[3]; + out[4] = in[4] + two54m8 - out[4]; +} + +/* Multiply a number by a scalar: output = in * scalar */ +static inline void force_inline +fscalar_product(felem output, const felem in, const limb scalar) { + uint128_t a; + + a = ((uint128_t) in[0]) * scalar; + output[0] = ((limb)a) & 0x7ffffffffffff; + + a = ((uint128_t) in[1]) * scalar + ((limb) (a >> 51)); + output[1] = ((limb)a) & 0x7ffffffffffff; + + a = ((uint128_t) in[2]) * scalar + ((limb) (a >> 51)); + output[2] = ((limb)a) & 0x7ffffffffffff; + + a = ((uint128_t) in[3]) * scalar + ((limb) (a >> 51)); + output[3] = ((limb)a) & 0x7ffffffffffff; + + a = ((uint128_t) in[4]) * scalar + ((limb) (a >> 51)); + output[4] = ((limb)a) & 0x7ffffffffffff; + + output[0] += (a >> 51) * 19; +} + +/* Multiply two numbers: output = in2 * in + * + * output must be distinct to both inputs. The inputs are reduced coefficient + * form, the output is not. + * + * Assumes that in[i] < 2**55 and likewise for in2. 
+ * On return, output[i] < 2**52 + */ +static inline void force_inline +fmul(felem output, const felem in2, const felem in) { + uint128_t t[5]; + limb r0,r1,r2,r3,r4,s0,s1,s2,s3,s4,c; + + r0 = in[0]; + r1 = in[1]; + r2 = in[2]; + r3 = in[3]; + r4 = in[4]; + + s0 = in2[0]; + s1 = in2[1]; + s2 = in2[2]; + s3 = in2[3]; + s4 = in2[4]; + + t[0] = ((uint128_t) r0) * s0; + t[1] = ((uint128_t) r0) * s1 + ((uint128_t) r1) * s0; + t[2] = ((uint128_t) r0) * s2 + ((uint128_t) r2) * s0 + ((uint128_t) r1) * s1; + t[3] = ((uint128_t) r0) * s3 + ((uint128_t) r3) * s0 + ((uint128_t) r1) * s2 + ((uint128_t) r2) * s1; + t[4] = ((uint128_t) r0) * s4 + ((uint128_t) r4) * s0 + ((uint128_t) r3) * s1 + ((uint128_t) r1) * s3 + ((uint128_t) r2) * s2; + + r4 *= 19; + r1 *= 19; + r2 *= 19; + r3 *= 19; + + t[0] += ((uint128_t) r4) * s1 + ((uint128_t) r1) * s4 + ((uint128_t) r2) * s3 + ((uint128_t) r3) * s2; + t[1] += ((uint128_t) r4) * s2 + ((uint128_t) r2) * s4 + ((uint128_t) r3) * s3; + t[2] += ((uint128_t) r4) * s3 + ((uint128_t) r3) * s4; + t[3] += ((uint128_t) r4) * s4; + + r0 = (limb)t[0] & 0x7ffffffffffff; c = (limb)(t[0] >> 51); + t[1] += c; r1 = (limb)t[1] & 0x7ffffffffffff; c = (limb)(t[1] >> 51); + t[2] += c; r2 = (limb)t[2] & 0x7ffffffffffff; c = (limb)(t[2] >> 51); + t[3] += c; r3 = (limb)t[3] & 0x7ffffffffffff; c = (limb)(t[3] >> 51); + t[4] += c; r4 = (limb)t[4] & 0x7ffffffffffff; c = (limb)(t[4] >> 51); + r0 += c * 19; c = r0 >> 51; r0 = r0 & 0x7ffffffffffff; + r1 += c; c = r1 >> 51; r1 = r1 & 0x7ffffffffffff; + r2 += c; + + output[0] = r0; + output[1] = r1; + output[2] = r2; + output[3] = r3; + output[4] = r4; +} + +static inline void force_inline +fsquare_times(felem output, const felem in, limb count) { + uint128_t t[5]; + limb r0,r1,r2,r3,r4,c; + limb d0,d1,d2,d4,d419; + + r0 = in[0]; + r1 = in[1]; + r2 = in[2]; + r3 = in[3]; + r4 = in[4]; + + do { + d0 = r0 * 2; + d1 = r1 * 2; + d2 = r2 * 2 * 19; + d419 = r4 * 19; + d4 = d419 * 2; + + t[0] = ((uint128_t) r0) * r0 + 
((uint128_t) d4) * r1 + (((uint128_t) d2) * (r3 )); + t[1] = ((uint128_t) d0) * r1 + ((uint128_t) d4) * r2 + (((uint128_t) r3) * (r3 * 19)); + t[2] = ((uint128_t) d0) * r2 + ((uint128_t) r1) * r1 + (((uint128_t) d4) * (r3 )); + t[3] = ((uint128_t) d0) * r3 + ((uint128_t) d1) * r2 + (((uint128_t) r4) * (d419 )); + t[4] = ((uint128_t) d0) * r4 + ((uint128_t) d1) * r3 + (((uint128_t) r2) * (r2 )); + + r0 = (limb)t[0] & 0x7ffffffffffff; c = (limb)(t[0] >> 51); + t[1] += c; r1 = (limb)t[1] & 0x7ffffffffffff; c = (limb)(t[1] >> 51); + t[2] += c; r2 = (limb)t[2] & 0x7ffffffffffff; c = (limb)(t[2] >> 51); + t[3] += c; r3 = (limb)t[3] & 0x7ffffffffffff; c = (limb)(t[3] >> 51); + t[4] += c; r4 = (limb)t[4] & 0x7ffffffffffff; c = (limb)(t[4] >> 51); + r0 += c * 19; c = r0 >> 51; r0 = r0 & 0x7ffffffffffff; + r1 += c; c = r1 >> 51; r1 = r1 & 0x7ffffffffffff; + r2 += c; + } while(--count); + + output[0] = r0; + output[1] = r1; + output[2] = r2; + output[3] = r3; + output[4] = r4; +} + +/* Load a little-endian 64-bit number */ +static limb +load_limb(const u8 *in) { + return + ((limb)in[0]) | + (((limb)in[1]) << 8) | + (((limb)in[2]) << 16) | + (((limb)in[3]) << 24) | + (((limb)in[4]) << 32) | + (((limb)in[5]) << 40) | + (((limb)in[6]) << 48) | + (((limb)in[7]) << 56); +} + +static void +store_limb(u8 *out, limb in) { + out[0] = in & 0xff; + out[1] = (in >> 8) & 0xff; + out[2] = (in >> 16) & 0xff; + out[3] = (in >> 24) & 0xff; + out[4] = (in >> 32) & 0xff; + out[5] = (in >> 40) & 0xff; + out[6] = (in >> 48) & 0xff; + out[7] = (in >> 56) & 0xff; +} + +/* Take a little-endian, 32-byte number and expand it into polynomial form */ +static void +fexpand(limb *output, const u8 *in) { + output[0] = load_limb(in) & 0x7ffffffffffff; + output[1] = (load_limb(in+6) >> 3) & 0x7ffffffffffff; + output[2] = (load_limb(in+12) >> 6) & 0x7ffffffffffff; + output[3] = (load_limb(in+19) >> 1) & 0x7ffffffffffff; + output[4] = (load_limb(in+24) >> 12) & 0x7ffffffffffff; +} + +/* Take a fully reduced 
polynomial form number and contract it into a + * little-endian, 32-byte array + */ +static void +fcontract(u8 *output, const felem input) { + uint128_t t[5]; + + t[0] = input[0]; + t[1] = input[1]; + t[2] = input[2]; + t[3] = input[3]; + t[4] = input[4]; + + t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff; + t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff; + t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff; + t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff; + t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffff; + + t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff; + t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff; + t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff; + t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff; + t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffff; + + /* now t is between 0 and 2^255-1, properly carried. */ + /* case 1: between 0 and 2^255-20. case 2: between 2^255-19 and 2^255-1. */ + + t[0] += 19; + + t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff; + t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff; + t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff; + t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff; + t[0] += 19 * (t[4] >> 51); t[4] &= 0x7ffffffffffff; + + /* now between 19 and 2^255-1 in both cases, and offset by 19. */ + + t[0] += 0x8000000000000 - 19; + t[1] += 0x8000000000000 - 1; + t[2] += 0x8000000000000 - 1; + t[3] += 0x8000000000000 - 1; + t[4] += 0x8000000000000 - 1; + + /* now between 2^255 and 2^256-20, and offset by 2^255. 
*/ + + t[1] += t[0] >> 51; t[0] &= 0x7ffffffffffff; + t[2] += t[1] >> 51; t[1] &= 0x7ffffffffffff; + t[3] += t[2] >> 51; t[2] &= 0x7ffffffffffff; + t[4] += t[3] >> 51; t[3] &= 0x7ffffffffffff; + t[4] &= 0x7ffffffffffff; + + store_limb(output, t[0] | (t[1] << 51)); + store_limb(output+8, (t[1] >> 13) | (t[2] << 38)); + store_limb(output+16, (t[2] >> 26) | (t[3] << 25)); + store_limb(output+24, (t[3] >> 39) | (t[4] << 12)); +} + +/* Input: Q, Q', Q-Q' + * Output: 2Q, Q+Q' + * + * x2 z2: long form + * x3 z3: long form + * x z: short form, destroyed + * xprime zprime: short form, destroyed + * qmqp: short form, preserved + */ +static void +fmonty(limb *x2, limb *z2, /* output 2Q */ + limb *x3, limb *z3, /* output Q + Q' */ + limb *x, limb *z, /* input Q */ + limb *xprime, limb *zprime, /* input Q' */ + const limb *qmqp /* input Q - Q' */) { + limb origx[5], origxprime[5], zzz[5], xx[5], zz[5], xxprime[5], + zzprime[5], zzzprime[5]; + + memcpy(origx, x, 5 * sizeof(limb)); + fsum(x, z); + fdifference_backwards(z, origx); // does x - z + + memcpy(origxprime, xprime, sizeof(limb) * 5); + fsum(xprime, zprime); + fdifference_backwards(zprime, origxprime); + fmul(xxprime, xprime, z); + fmul(zzprime, x, zprime); + memcpy(origxprime, xxprime, sizeof(limb) * 5); + fsum(xxprime, zzprime); + fdifference_backwards(zzprime, origxprime); + fsquare_times(x3, xxprime, 1); + fsquare_times(zzzprime, zzprime, 1); + fmul(z3, zzzprime, qmqp); + + fsquare_times(xx, x, 1); + fsquare_times(zz, z, 1); + fmul(x2, xx, zz); + fdifference_backwards(zz, xx); // does zz = xx - zz + fscalar_product(zzz, zz, 121665); + fsum(zzz, xx); + fmul(z2, zz, zzz); +} + +// ----------------------------------------------------------------------------- +// Maybe swap the contents of two limb arrays (@a and @b), each 5 elements +// long. Perform the swap iff @iswap is 1; @iswap must be 0 or 1. +// +// This function performs the swap without leaking any side-channel +// information. 
+// ----------------------------------------------------------------------------- +static void +swap_conditional(limb a[5], limb b[5], limb iswap) { + unsigned i; + const limb swap = -iswap; + + for (i = 0; i < 5; ++i) { + const limb x = swap & (a[i] ^ b[i]); + a[i] ^= x; + b[i] ^= x; + } +} + +/* Calculates nQ where Q is the x-coordinate of a point on the curve + * + * resultx/resultz: the x coordinate of the resulting curve point (short form) + * n: a little endian, 32-byte number + * q: a point of the curve (short form) + */ +static void +cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) { + limb a[5] = {0}, b[5] = {1}, c[5] = {1}, d[5] = {0}; + limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; + limb e[5] = {0}, f[5] = {1}, g[5] = {0}, h[5] = {1}; + limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h; + + unsigned i, j; + + memcpy(nqpqx, q, sizeof(limb) * 5); + + for (i = 0; i < 32; ++i) { + u8 byte = n[31 - i]; + for (j = 0; j < 8; ++j) { + const limb bit = byte >> 7; + + swap_conditional(nqx, nqpqx, bit); + swap_conditional(nqz, nqpqz, bit); + fmonty(nqx2, nqz2, + nqpqx2, nqpqz2, + nqx, nqz, + nqpqx, nqpqz, + q); + swap_conditional(nqx2, nqpqx2, bit); + swap_conditional(nqz2, nqpqz2, bit); + + t = nqx; + nqx = nqx2; + nqx2 = t; + t = nqz; + nqz = nqz2; + nqz2 = t; + t = nqpqx; + nqpqx = nqpqx2; + nqpqx2 = t; + t = nqpqz; + nqpqz = nqpqz2; + nqpqz2 = t; + + byte <<= 1; + } + } + + memcpy(resultx, nqx, sizeof(limb) * 5); + memcpy(resultz, nqz, sizeof(limb) * 5); +} + + +// ----------------------------------------------------------------------------- +// Shamelessly copied from djb's code, tightened a little +// ----------------------------------------------------------------------------- +static void +crecip(felem out, const felem z) { + felem a,t0,b,c; + + /* 2 */ fsquare_times(a, z, 1); // a = 2 + /* 8 */ fsquare_times(t0, a, 2); + /* 9 */ fmul(b, t0, z); // b = 9 + /* 11 */ fmul(a, b, a); // a = 11 + /* 22 */ fsquare_times(t0, a, 1); + /* 
2^5 - 2^0 = 31 */ fmul(b, t0, b); + /* 2^10 - 2^5 */ fsquare_times(t0, b, 5); + /* 2^10 - 2^0 */ fmul(b, t0, b); + /* 2^20 - 2^10 */ fsquare_times(t0, b, 10); + /* 2^20 - 2^0 */ fmul(c, t0, b); + /* 2^40 - 2^20 */ fsquare_times(t0, c, 20); + /* 2^40 - 2^0 */ fmul(t0, t0, c); + /* 2^50 - 2^10 */ fsquare_times(t0, t0, 10); + /* 2^50 - 2^0 */ fmul(b, t0, b); + /* 2^100 - 2^50 */ fsquare_times(t0, b, 50); + /* 2^100 - 2^0 */ fmul(c, t0, b); + /* 2^200 - 2^100 */ fsquare_times(t0, c, 100); + /* 2^200 - 2^0 */ fmul(t0, t0, c); + /* 2^250 - 2^50 */ fsquare_times(t0, t0, 50); + /* 2^250 - 2^0 */ fmul(t0, t0, b); + /* 2^255 - 2^5 */ fsquare_times(t0, t0, 5); + /* 2^255 - 21 */ fmul(out, t0, a); +} + +int curve25519_donna(u8 *, const u8 *, const u8 *); + +int +curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) { + limb bp[5], x[5], z[5], zmone[5]; + uint8_t e[32]; + int i; + + for (i = 0;i < 32;++i) e[i] = secret[i]; + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + + fexpand(bp, basepoint); + cmult(x, z, e, bp); + crecip(zmone, z); + fmul(z, x, zmone); + fcontract(mypublic, z); + return 0; +} diff --git a/curve25519-donna.c b/curve25519-donna.c new file mode 100644 index 0000000..ed15d6c --- /dev/null +++ b/curve25519-donna.c @@ -0,0 +1,860 @@ +/* Copyright 2008, Google Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following disclaimer + * in the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Google Inc. 
nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * curve25519-donna: Curve25519 elliptic curve, public key function + * + * http://code.google.com/p/curve25519-donna/ + * + * Adam Langley <agl@imperialviolet.org> + * + * Derived from public domain C code by Daniel J. Bernstein <djb@cr.yp.to> + * + * More information about curve25519 can be found here + * http://cr.yp.to/ecdh.html + * + * djb's sample implementation of curve25519 is written in a special assembly + * language called qhasm and uses the floating point registers. + * + * This is, almost, a clean room reimplementation from the curve25519 paper. It + * uses many of the tricks described therein. Only the crecip function is taken + * from the sample implementation. */ + +#include <string.h> +#include <stdint.h> + +#ifdef _MSC_VER +#define inline __inline +#endif + +typedef uint8_t u8; +typedef int32_t s32; +typedef int64_t limb; + +/* Field element representation: + * + * Field elements are written as an array of signed, 64-bit limbs, least + * significant first. 
The value of the field element is: + * x[0] + 2^26·x[1] + 2^51·x[2] + 2^77·x[3] + ... + * + * i.e. the limbs are 26, 25, 26, 25, ... bits wide. */ + +/* Sum two numbers: output += in */ +static void fsum(limb *output, const limb *in) { + unsigned i; + for (i = 0; i < 10; i += 2) { + output[0+i] = output[0+i] + in[0+i]; + output[1+i] = output[1+i] + in[1+i]; + } +} + +/* Find the difference of two numbers: output = in - output + * (note the order of the arguments!). */ +static void fdifference(limb *output, const limb *in) { + unsigned i; + for (i = 0; i < 10; ++i) { + output[i] = in[i] - output[i]; + } +} + +/* Multiply a number by a scalar: output = in * scalar */ +static void fscalar_product(limb *output, const limb *in, const limb scalar) { + unsigned i; + for (i = 0; i < 10; ++i) { + output[i] = in[i] * scalar; + } +} + +/* Multiply two numbers: output = in2 * in + * + * output must be distinct to both inputs. The inputs are reduced coefficient + * form, the output is not. + * + * output[x] <= 14 * the largest product of the input limbs. 
*/ +static void fproduct(limb *output, const limb *in2, const limb *in) { + output[0] = ((limb) ((s32) in2[0])) * ((s32) in[0]); + output[1] = ((limb) ((s32) in2[0])) * ((s32) in[1]) + + ((limb) ((s32) in2[1])) * ((s32) in[0]); + output[2] = 2 * ((limb) ((s32) in2[1])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[2]) + + ((limb) ((s32) in2[2])) * ((s32) in[0]); + output[3] = ((limb) ((s32) in2[1])) * ((s32) in[2]) + + ((limb) ((s32) in2[2])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[3]) + + ((limb) ((s32) in2[3])) * ((s32) in[0]); + output[4] = ((limb) ((s32) in2[2])) * ((s32) in[2]) + + 2 * (((limb) ((s32) in2[1])) * ((s32) in[3]) + + ((limb) ((s32) in2[3])) * ((s32) in[1])) + + ((limb) ((s32) in2[0])) * ((s32) in[4]) + + ((limb) ((s32) in2[4])) * ((s32) in[0]); + output[5] = ((limb) ((s32) in2[2])) * ((s32) in[3]) + + ((limb) ((s32) in2[3])) * ((s32) in[2]) + + ((limb) ((s32) in2[1])) * ((s32) in[4]) + + ((limb) ((s32) in2[4])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[5]) + + ((limb) ((s32) in2[5])) * ((s32) in[0]); + output[6] = 2 * (((limb) ((s32) in2[3])) * ((s32) in[3]) + + ((limb) ((s32) in2[1])) * ((s32) in[5]) + + ((limb) ((s32) in2[5])) * ((s32) in[1])) + + ((limb) ((s32) in2[2])) * ((s32) in[4]) + + ((limb) ((s32) in2[4])) * ((s32) in[2]) + + ((limb) ((s32) in2[0])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[0]); + output[7] = ((limb) ((s32) in2[3])) * ((s32) in[4]) + + ((limb) ((s32) in2[4])) * ((s32) in[3]) + + ((limb) ((s32) in2[2])) * ((s32) in[5]) + + ((limb) ((s32) in2[5])) * ((s32) in[2]) + + ((limb) ((s32) in2[1])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[0]); + output[8] = ((limb) ((s32) in2[4])) * ((s32) in[4]) + + 2 * (((limb) ((s32) in2[3])) * ((s32) in[5]) + + ((limb) ((s32) in2[5])) * ((s32) in[3]) + + ((limb) ((s32) in2[1])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[1])) + 
+ ((limb) ((s32) in2[2])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[2]) + + ((limb) ((s32) in2[0])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[0]); + output[9] = ((limb) ((s32) in2[4])) * ((s32) in[5]) + + ((limb) ((s32) in2[5])) * ((s32) in[4]) + + ((limb) ((s32) in2[3])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[3]) + + ((limb) ((s32) in2[2])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[2]) + + ((limb) ((s32) in2[1])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[1]) + + ((limb) ((s32) in2[0])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[0]); + output[10] = 2 * (((limb) ((s32) in2[5])) * ((s32) in[5]) + + ((limb) ((s32) in2[3])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[3]) + + ((limb) ((s32) in2[1])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[1])) + + ((limb) ((s32) in2[4])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[4]) + + ((limb) ((s32) in2[2])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[2]); + output[11] = ((limb) ((s32) in2[5])) * ((s32) in[6]) + + ((limb) ((s32) in2[6])) * ((s32) in[5]) + + ((limb) ((s32) in2[4])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[4]) + + ((limb) ((s32) in2[3])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[3]) + + ((limb) ((s32) in2[2])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[2]); + output[12] = ((limb) ((s32) in2[6])) * ((s32) in[6]) + + 2 * (((limb) ((s32) in2[5])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[5]) + + ((limb) ((s32) in2[3])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[3])) + + ((limb) ((s32) in2[4])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[4]); + output[13] = ((limb) ((s32) in2[6])) * ((s32) in[7]) + + ((limb) ((s32) in2[7])) * ((s32) in[6]) + + ((limb) ((s32) in2[5])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[5]) + + ((limb) ((s32) in2[4])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) 
in[4]); + output[14] = 2 * (((limb) ((s32) in2[7])) * ((s32) in[7]) + + ((limb) ((s32) in2[5])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[5])) + + ((limb) ((s32) in2[6])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[6]); + output[15] = ((limb) ((s32) in2[7])) * ((s32) in[8]) + + ((limb) ((s32) in2[8])) * ((s32) in[7]) + + ((limb) ((s32) in2[6])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[6]); + output[16] = ((limb) ((s32) in2[8])) * ((s32) in[8]) + + 2 * (((limb) ((s32) in2[7])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[7])); + output[17] = ((limb) ((s32) in2[8])) * ((s32) in[9]) + + ((limb) ((s32) in2[9])) * ((s32) in[8]); + output[18] = 2 * ((limb) ((s32) in2[9])) * ((s32) in[9]); +} + +/* Reduce a long form to a short form by taking the input mod 2^255 - 19. + * + * On entry: |output[i]| < 14*2^54 + * On exit: |output[0..8]| < 280*2^54 */ +static void freduce_degree(limb *output) { + /* Each of these shifts and adds ends up multiplying the value by 19. + * + * For output[0..8], the absolute entry value is < 14*2^54 and we add, at + * most, 19*14*2^54 thus, on exit, |output[0..8]| < 280*2^54. 
*/ + output[8] += output[18] << 4; + output[8] += output[18] << 1; + output[8] += output[18]; + output[7] += output[17] << 4; + output[7] += output[17] << 1; + output[7] += output[17]; + output[6] += output[16] << 4; + output[6] += output[16] << 1; + output[6] += output[16]; + output[5] += output[15] << 4; + output[5] += output[15] << 1; + output[5] += output[15]; + output[4] += output[14] << 4; + output[4] += output[14] << 1; + output[4] += output[14]; + output[3] += output[13] << 4; + output[3] += output[13] << 1; + output[3] += output[13]; + output[2] += output[12] << 4; + output[2] += output[12] << 1; + output[2] += output[12]; + output[1] += output[11] << 4; + output[1] += output[11] << 1; + output[1] += output[11]; + output[0] += output[10] << 4; + output[0] += output[10] << 1; + output[0] += output[10]; +} + +#if (-1 & 3) != 3 +#error "This code only works on a two's complement system" +#endif + +/* return v / 2^26, using only shifts and adds. + * + * On entry: v can take any value. */ +static inline limb +div_by_2_26(const limb v) +{ + /* High word of v; no shift needed. */ + const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); + /* Set to all 1s if v was negative; else set to 0s. */ + const int32_t sign = ((int32_t) highword) >> 31; + /* Set to 0x3ffffff if v was negative; else set to 0. */ + const int32_t roundoff = ((uint32_t) sign) >> 6; + /* Should return v / (1<<26) */ + return (v + roundoff) >> 26; +} + +/* return v / (2^25), using only shifts and adds. + * + * On entry: v can take any value. */ +static inline limb +div_by_2_25(const limb v) +{ + /* High word of v; no shift needed*/ + const uint32_t highword = (uint32_t) (((uint64_t) v) >> 32); + /* Set to all 1s if v was negative; else set to 0s. */ + const int32_t sign = ((int32_t) highword) >> 31; + /* Set to 0x1ffffff if v was negative; else set to 0. 
*/ + const int32_t roundoff = ((uint32_t) sign) >> 7; + /* Should return v / (1<<25) */ + return (v + roundoff) >> 25; +} + +/* Reduce all coefficients of the short form input so that |x| < 2^26. + * + * On entry: |output[i]| < 280*2^54 */ +static void freduce_coefficients(limb *output) { + unsigned i; + + output[10] = 0; + + for (i = 0; i < 10; i += 2) { + limb over = div_by_2_26(output[i]); + /* The entry condition (that |output[i]| < 280*2^54) means that over is, at + * most, 280*2^28 in the first iteration of this loop. This is added to the + * next limb and we can approximate the resulting bound of that limb by + * 281*2^54. */ + output[i] -= over << 26; + output[i+1] += over; + + /* For the first iteration, |output[i+1]| < 281*2^54, thus |over| < + * 281*2^29. When this is added to the next limb, the resulting bound can + * be approximated as 281*2^54. + * + * For subsequent iterations of the loop, 281*2^54 remains a conservative + * bound and no overflow occurs. */ + over = div_by_2_25(output[i+1]); + output[i+1] -= over << 25; + output[i+2] += over; + } + /* Now |output[10]| < 281*2^29 and all other coefficients are reduced. */ + output[0] += output[10] << 4; + output[0] += output[10] << 1; + output[0] += output[10]; + + output[10] = 0; + + /* Now output[1..9] are reduced, and |output[0]| < 2^26 + 19*281*2^29 + * So |over| will be no more than 2^16. */ + { + limb over = div_by_2_26(output[0]); + output[0] -= over << 26; + output[1] += over; + } + + /* Now output[0,2..9] are reduced, and |output[1]| < 2^25 + 2^16 < 2^26. The + * bound on |output[1]| is sufficient to meet our needs. */ +} + +/* A helpful wrapper around fproduct: output = in * in2. + * + * On entry: |in[i]| < 2^27 and |in2[i]| < 2^27. + * + * output must be distinct to both inputs. The output is reduced degree + * (indeed, one need only provide storage for 10 limbs) and |output[i]| < 2^26. 
*/ +static void +fmul(limb *output, const limb *in, const limb *in2) { + limb t[19]; + fproduct(t, in, in2); + /* |t[i]| < 14*2^54 */ + freduce_degree(t); + freduce_coefficients(t); + /* |t[i]| < 2^26 */ + memcpy(output, t, sizeof(limb) * 10); +} + +/* Square a number: output = in**2 + * + * output must be distinct from the input. The inputs are reduced coefficient + * form, the output is not. + * + * output[x] <= 14 * the largest product of the input limbs. */ +static void fsquare_inner(limb *output, const limb *in) { + output[0] = ((limb) ((s32) in[0])) * ((s32) in[0]); + output[1] = 2 * ((limb) ((s32) in[0])) * ((s32) in[1]); + output[2] = 2 * (((limb) ((s32) in[1])) * ((s32) in[1]) + + ((limb) ((s32) in[0])) * ((s32) in[2])); + output[3] = 2 * (((limb) ((s32) in[1])) * ((s32) in[2]) + + ((limb) ((s32) in[0])) * ((s32) in[3])); + output[4] = ((limb) ((s32) in[2])) * ((s32) in[2]) + + 4 * ((limb) ((s32) in[1])) * ((s32) in[3]) + + 2 * ((limb) ((s32) in[0])) * ((s32) in[4]); + output[5] = 2 * (((limb) ((s32) in[2])) * ((s32) in[3]) + + ((limb) ((s32) in[1])) * ((s32) in[4]) + + ((limb) ((s32) in[0])) * ((s32) in[5])); + output[6] = 2 * (((limb) ((s32) in[3])) * ((s32) in[3]) + + ((limb) ((s32) in[2])) * ((s32) in[4]) + + ((limb) ((s32) in[0])) * ((s32) in[6]) + + 2 * ((limb) ((s32) in[1])) * ((s32) in[5])); + output[7] = 2 * (((limb) ((s32) in[3])) * ((s32) in[4]) + + ((limb) ((s32) in[2])) * ((s32) in[5]) + + ((limb) ((s32) in[1])) * ((s32) in[6]) + + ((limb) ((s32) in[0])) * ((s32) in[7])); + output[8] = ((limb) ((s32) in[4])) * ((s32) in[4]) + + 2 * (((limb) ((s32) in[2])) * ((s32) in[6]) + + ((limb) ((s32) in[0])) * ((s32) in[8]) + + 2 * (((limb) ((s32) in[1])) * ((s32) in[7]) + + ((limb) ((s32) in[3])) * ((s32) in[5]))); + output[9] = 2 * (((limb) ((s32) in[4])) * ((s32) in[5]) + + ((limb) ((s32) in[3])) * ((s32) in[6]) + + ((limb) ((s32) in[2])) * ((s32) in[7]) + + ((limb) ((s32) in[1])) * ((s32) in[8]) + + ((limb) ((s32) in[0])) * ((s32) in[9])); + 
output[10] = 2 * (((limb) ((s32) in[5])) * ((s32) in[5]) + + ((limb) ((s32) in[4])) * ((s32) in[6]) + + ((limb) ((s32) in[2])) * ((s32) in[8]) + + 2 * (((limb) ((s32) in[3])) * ((s32) in[7]) + + ((limb) ((s32) in[1])) * ((s32) in[9]))); + output[11] = 2 * (((limb) ((s32) in[5])) * ((s32) in[6]) + + ((limb) ((s32) in[4])) * ((s32) in[7]) + + ((limb) ((s32) in[3])) * ((s32) in[8]) + + ((limb) ((s32) in[2])) * ((s32) in[9])); + output[12] = ((limb) ((s32) in[6])) * ((s32) in[6]) + + 2 * (((limb) ((s32) in[4])) * ((s32) in[8]) + + 2 * (((limb) ((s32) in[5])) * ((s32) in[7]) + + ((limb) ((s32) in[3])) * ((s32) in[9]))); + output[13] = 2 * (((limb) ((s32) in[6])) * ((s32) in[7]) + + ((limb) ((s32) in[5])) * ((s32) in[8]) + + ((limb) ((s32) in[4])) * ((s32) in[9])); + output[14] = 2 * (((limb) ((s32) in[7])) * ((s32) in[7]) + + ((limb) ((s32) in[6])) * ((s32) in[8]) + + 2 * ((limb) ((s32) in[5])) * ((s32) in[9])); + output[15] = 2 * (((limb) ((s32) in[7])) * ((s32) in[8]) + + ((limb) ((s32) in[6])) * ((s32) in[9])); + output[16] = ((limb) ((s32) in[8])) * ((s32) in[8]) + + 4 * ((limb) ((s32) in[7])) * ((s32) in[9]); + output[17] = 2 * ((limb) ((s32) in[8])) * ((s32) in[9]); + output[18] = 2 * ((limb) ((s32) in[9])) * ((s32) in[9]); +} + +/* fsquare sets output = in^2. + * + * On entry: The |in| argument is in reduced coefficients form and |in[i]| < + * 2^27. + * + * On exit: The |output| argument is in reduced coefficients form (indeed, one + * need only provide storage for 10 limbs) and |out[i]| < 2^26. */ +static void +fsquare(limb *output, const limb *in) { + limb t[19]; + fsquare_inner(t, in); + /* |t[i]| < 14*2^54 because the largest product of two limbs will be < + * 2^(27+27) and fsquare_inner adds together, at most, 14 of those + * products. 
*/ + freduce_degree(t); + freduce_coefficients(t); + /* |t[i]| < 2^26 */ + memcpy(output, t, sizeof(limb) * 10); +} + +/* Take a little-endian, 32-byte number and expand it into polynomial form */ +static void +fexpand(limb *output, const u8 *input) { +#define F(n,start,shift,mask) \ + output[n] = ((((limb) input[start + 0]) | \ + ((limb) input[start + 1]) << 8 | \ + ((limb) input[start + 2]) << 16 | \ + ((limb) input[start + 3]) << 24) >> shift) & mask; + F(0, 0, 0, 0x3ffffff); + F(1, 3, 2, 0x1ffffff); + F(2, 6, 3, 0x3ffffff); + F(3, 9, 5, 0x1ffffff); + F(4, 12, 6, 0x3ffffff); + F(5, 16, 0, 0x1ffffff); + F(6, 19, 1, 0x3ffffff); + F(7, 22, 3, 0x1ffffff); + F(8, 25, 4, 0x3ffffff); + F(9, 28, 6, 0x1ffffff); +#undef F +} + +#if (-32 >> 1) != -16 +#error "This code only works when >> does sign-extension on negative numbers" +#endif + +/* s32_eq returns 0xffffffff iff a == b and zero otherwise. */ +static s32 s32_eq(s32 a, s32 b) { + a = ~(a ^ b); + a &= a << 16; + a &= a << 8; + a &= a << 4; + a &= a << 2; + a &= a << 1; + return a >> 31; +} + +/* s32_gte returns 0xffffffff if a >= b and zero otherwise, where a and b are + * both non-negative. */ +static s32 s32_gte(s32 a, s32 b) { + a -= b; + /* a >= 0 iff a >= b. */ + return ~(a >> 31); +} + +/* Take a fully reduced polynomial form number and contract it into a + * little-endian, 32-byte array. + * + * On entry: |input_limbs[i]| < 2^26 */ +static void +fcontract(u8 *output, limb *input_limbs) { + int i; + int j; + s32 input[10]; + s32 mask; + + /* |input_limbs[i]| < 2^26, so it's valid to convert to an s32. */ + for (i = 0; i < 10; i++) { + input[i] = input_limbs[i]; + } + + for (j = 0; j < 2; ++j) { + for (i = 0; i < 9; ++i) { + if ((i & 1) == 1) { + /* This calculation is a time-invariant way to make input[i] + * non-negative by borrowing from the next-larger limb. 
*/ + const s32 mask = input[i] >> 31; + const s32 carry = -((input[i] & mask) >> 25); + input[i] = input[i] + (carry << 25); + input[i+1] = input[i+1] - carry; + } else { + const s32 mask = input[i] >> 31; + const s32 carry = -((input[i] & mask) >> 26); + input[i] = input[i] + (carry << 26); + input[i+1] = input[i+1] - carry; + } + } + + /* There's no greater limb for input[9] to borrow from, but we can multiply + * by 19 and borrow from input[0], which is valid mod 2^255-19. */ + { + const s32 mask = input[9] >> 31; + const s32 carry = -((input[9] & mask) >> 25); + input[9] = input[9] + (carry << 25); + input[0] = input[0] - (carry * 19); + } + + /* After the first iteration, input[1..9] are non-negative and fit within + * 25 or 26 bits, depending on position. However, input[0] may be + * negative. */ + } + + /* The first borrow-propagation pass above ended with every limb + except (possibly) input[0] non-negative. + + If input[0] was negative after the first pass, then it was because of a + carry from input[9]. On entry, input[9] < 2^26 so the carry was, at most, + one, since (2**26-1) >> 25 = 1. Thus input[0] >= -19. + + In the second pass, each limb is decreased by at most one. Thus the second + borrow-propagation pass could only have wrapped around to decrease + input[0] again if the first pass left input[0] negative *and* input[1] + through input[9] were all zero. In that case, input[1] is now 2^25 - 1, + and this last borrow-propagation step will leave input[1] non-negative. */ + { + const s32 mask = input[0] >> 31; + const s32 carry = -((input[0] & mask) >> 26); + input[0] = input[0] + (carry << 26); + input[1] = input[1] - carry; + } + + /* All input[i] are now non-negative. However, there might be values between + * 2^25 and 2^26 in a limb which is, nominally, 25 bits wide. 
*/ + for (j = 0; j < 2; j++) { + for (i = 0; i < 9; i++) { + if ((i & 1) == 1) { + const s32 carry = input[i] >> 25; + input[i] &= 0x1ffffff; + input[i+1] += carry; + } else { + const s32 carry = input[i] >> 26; + input[i] &= 0x3ffffff; + input[i+1] += carry; + } + } + + { + const s32 carry = input[9] >> 25; + input[9] &= 0x1ffffff; + input[0] += 19*carry; + } + } + + /* If the first carry-chain pass, just above, ended up with a carry from + * input[9], and that caused input[0] to be out-of-bounds, then input[0] was + * < 2^26 + 2*19, because the carry was, at most, two. + * + * If the second pass carried from input[9] again then input[0] is < 2*19 and + * the input[9] -> input[0] carry didn't push input[0] out of bounds. */ + + /* It still remains the case that input might be between 2^255-19 and 2^255. + * In this case, input[1..9] must take their maximum value and input[0] must + * be >= (2^255-19) & 0x3ffffff, which is 0x3ffffed. */ + mask = s32_gte(input[0], 0x3ffffed); + for (i = 1; i < 10; i++) { + if ((i & 1) == 1) { + mask &= s32_eq(input[i], 0x1ffffff); + } else { + mask &= s32_eq(input[i], 0x3ffffff); + } + } + + /* mask is either 0xffffffff (if input >= 2^255-19) and zero otherwise. Thus + * this conditionally subtracts 2^255-19. 
*/ + input[0] -= mask & 0x3ffffed; + + for (i = 1; i < 10; i++) { + if ((i & 1) == 1) { + input[i] -= mask & 0x1ffffff; + } else { + input[i] -= mask & 0x3ffffff; + } + } + + input[1] <<= 2; + input[2] <<= 3; + input[3] <<= 5; + input[4] <<= 6; + input[6] <<= 1; + input[7] <<= 3; + input[8] <<= 4; + input[9] <<= 6; +#define F(i, s) \ + output[s+0] |= input[i] & 0xff; \ + output[s+1] = (input[i] >> 8) & 0xff; \ + output[s+2] = (input[i] >> 16) & 0xff; \ + output[s+3] = (input[i] >> 24) & 0xff; + output[0] = 0; + output[16] = 0; + F(0,0); + F(1,3); + F(2,6); + F(3,9); + F(4,12); + F(5,16); + F(6,19); + F(7,22); + F(8,25); + F(9,28); +#undef F +} + +/* Input: Q, Q', Q-Q' + * Output: 2Q, Q+Q' + * + * x2 z2: long form + * x3 z3: long form + * x z: short form, destroyed + * xprime zprime: short form, destroyed + * qmqp: short form, preserved + * + * On entry and exit, the absolute value of the limbs of all inputs and outputs + * are < 2^26. */ +static void fmonty(limb *x2, limb *z2, /* output 2Q */ + limb *x3, limb *z3, /* output Q + Q' */ + limb *x, limb *z, /* input Q */ + limb *xprime, limb *zprime, /* input Q' */ + const limb *qmqp /* input Q - Q' */) { + limb origx[10], origxprime[10], zzz[19], xx[19], zz[19], xxprime[19], + zzprime[19], zzzprime[19], xxxprime[19]; + + memcpy(origx, x, 10 * sizeof(limb)); + fsum(x, z); + /* |x[i]| < 2^27 */ + fdifference(z, origx); /* does x - z */ + /* |z[i]| < 2^27 */ + + memcpy(origxprime, xprime, sizeof(limb) * 10); + fsum(xprime, zprime); + /* |xprime[i]| < 2^27 */ + fdifference(zprime, origxprime); + /* |zprime[i]| < 2^27 */ + fproduct(xxprime, xprime, z); + /* |xxprime[i]| < 14*2^54: the largest product of two limbs will be < + * 2^(27+27) and fproduct adds together, at most, 14 of those products. + * (Approximating that to 2^58 doesn't work out.) 
*/ + fproduct(zzprime, x, zprime); + /* |zzprime[i]| < 14*2^54 */ + freduce_degree(xxprime); + freduce_coefficients(xxprime); + /* |xxprime[i]| < 2^26 */ + freduce_degree(zzprime); + freduce_coefficients(zzprime); + /* |zzprime[i]| < 2^26 */ + memcpy(origxprime, xxprime, sizeof(limb) * 10); + fsum(xxprime, zzprime); + /* |xxprime[i]| < 2^27 */ + fdifference(zzprime, origxprime); + /* |zzprime[i]| < 2^27 */ + fsquare(xxxprime, xxprime); + /* |xxxprime[i]| < 2^26 */ + fsquare(zzzprime, zzprime); + /* |zzzprime[i]| < 2^26 */ + fproduct(zzprime, zzzprime, qmqp); + /* |zzprime[i]| < 14*2^52 */ + freduce_degree(zzprime); + freduce_coefficients(zzprime); + /* |zzprime[i]| < 2^26 */ + memcpy(x3, xxxprime, sizeof(limb) * 10); + memcpy(z3, zzprime, sizeof(limb) * 10); + + fsquare(xx, x); + /* |xx[i]| < 2^26 */ + fsquare(zz, z); + /* |zz[i]| < 2^26 */ + fproduct(x2, xx, zz); + /* |x2[i]| < 14*2^52 */ + freduce_degree(x2); + freduce_coefficients(x2); + /* |x2[i]| < 2^26 */ + fdifference(zz, xx); // does zz = xx - zz + /* |zz[i]| < 2^27 */ + memset(zzz + 10, 0, sizeof(limb) * 9); + fscalar_product(zzz, zz, 121665); + /* |zzz[i]| < 2^(27+17) */ + /* No need to call freduce_degree here: + fscalar_product doesn't increase the degree of its input. */ + freduce_coefficients(zzz); + /* |zzz[i]| < 2^26 */ + fsum(zzz, xx); + /* |zzz[i]| < 2^27 */ + fproduct(z2, zz, zzz); + /* |z2[i]| < 14*2^(26+27) */ + freduce_degree(z2); + freduce_coefficients(z2); + /* |z2[i]| < 2^26 */ +} + +/* Conditionally swap two reduced-form limb arrays if 'iswap' is 1, but leave + * them unchanged if 'iswap' is 0. Runs in data-invariant time to avoid + * side-channel attacks. + * + * NOTE that this function requires that 'iswap' be 1 or 0; other values give + * wrong results. Also, the two limb arrays must be in reduced-coefficient, + * reduced-degree form: the values in a[10..18] or b[10..18] aren't swapped, + * and all values in a[0..9],b[0..9] must have magnitude less than + * INT32_MAX. 
*/ +static void +swap_conditional(limb a[19], limb b[19], limb iswap) { + unsigned i; + const s32 swap = (s32) -iswap; + + for (i = 0; i < 10; ++i) { + const s32 x = swap & ( ((s32)a[i]) ^ ((s32)b[i]) ); + a[i] = ((s32)a[i]) ^ x; + b[i] = ((s32)b[i]) ^ x; + } +} + +/* Calculates nQ where Q is the x-coordinate of a point on the curve + * + * resultx/resultz: the x coordinate of the resulting curve point (short form) + * n: a little endian, 32-byte number + * q: a point of the curve (short form) */ +static void +cmult(limb *resultx, limb *resultz, const u8 *n, const limb *q) { + limb a[19] = {0}, b[19] = {1}, c[19] = {1}, d[19] = {0}; + limb *nqpqx = a, *nqpqz = b, *nqx = c, *nqz = d, *t; + limb e[19] = {0}, f[19] = {1}, g[19] = {0}, h[19] = {1}; + limb *nqpqx2 = e, *nqpqz2 = f, *nqx2 = g, *nqz2 = h; + + unsigned i, j; + + memcpy(nqpqx, q, sizeof(limb) * 10); + + for (i = 0; i < 32; ++i) { + u8 byte = n[31 - i]; + for (j = 0; j < 8; ++j) { + const limb bit = byte >> 7; + + swap_conditional(nqx, nqpqx, bit); + swap_conditional(nqz, nqpqz, bit); + fmonty(nqx2, nqz2, + nqpqx2, nqpqz2, + nqx, nqz, + nqpqx, nqpqz, + q); + swap_conditional(nqx2, nqpqx2, bit); + swap_conditional(nqz2, nqpqz2, bit); + + t = nqx; + nqx = nqx2; + nqx2 = t; + t = nqz; + nqz = nqz2; + nqz2 = t; + t = nqpqx; + nqpqx = nqpqx2; + nqpqx2 = t; + t = nqpqz; + nqpqz = nqpqz2; + nqpqz2 = t; + + byte <<= 1; + } + } + + memcpy(resultx, nqx, sizeof(limb) * 10); + memcpy(resultz, nqz, sizeof(limb) * 10); +} + +// ----------------------------------------------------------------------------- +// Shamelessly copied from djb's code +// ----------------------------------------------------------------------------- +static void +crecip(limb *out, const limb *z) { + limb z2[10]; + limb z9[10]; + limb z11[10]; + limb z2_5_0[10]; + limb z2_10_0[10]; + limb z2_20_0[10]; + limb z2_50_0[10]; + limb z2_100_0[10]; + limb t0[10]; + limb t1[10]; + int i; + + /* 2 */ fsquare(z2,z); + /* 4 */ fsquare(t1,z2); + /* 8 */ 
fsquare(t0,t1); + /* 9 */ fmul(z9,t0,z); + /* 11 */ fmul(z11,z9,z2); + /* 22 */ fsquare(t0,z11); + /* 2^5 - 2^0 = 31 */ fmul(z2_5_0,t0,z9); + + /* 2^6 - 2^1 */ fsquare(t0,z2_5_0); + /* 2^7 - 2^2 */ fsquare(t1,t0); + /* 2^8 - 2^3 */ fsquare(t0,t1); + /* 2^9 - 2^4 */ fsquare(t1,t0); + /* 2^10 - 2^5 */ fsquare(t0,t1); + /* 2^10 - 2^0 */ fmul(z2_10_0,t0,z2_5_0); + + /* 2^11 - 2^1 */ fsquare(t0,z2_10_0); + /* 2^12 - 2^2 */ fsquare(t1,t0); + /* 2^20 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } + /* 2^20 - 2^0 */ fmul(z2_20_0,t1,z2_10_0); + + /* 2^21 - 2^1 */ fsquare(t0,z2_20_0); + /* 2^22 - 2^2 */ fsquare(t1,t0); + /* 2^40 - 2^20 */ for (i = 2;i < 20;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } + /* 2^40 - 2^0 */ fmul(t0,t1,z2_20_0); + + /* 2^41 - 2^1 */ fsquare(t1,t0); + /* 2^42 - 2^2 */ fsquare(t0,t1); + /* 2^50 - 2^10 */ for (i = 2;i < 10;i += 2) { fsquare(t1,t0); fsquare(t0,t1); } + /* 2^50 - 2^0 */ fmul(z2_50_0,t0,z2_10_0); + + /* 2^51 - 2^1 */ fsquare(t0,z2_50_0); + /* 2^52 - 2^2 */ fsquare(t1,t0); + /* 2^100 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } + /* 2^100 - 2^0 */ fmul(z2_100_0,t1,z2_50_0); + + /* 2^101 - 2^1 */ fsquare(t1,z2_100_0); + /* 2^102 - 2^2 */ fsquare(t0,t1); + /* 2^200 - 2^100 */ for (i = 2;i < 100;i += 2) { fsquare(t1,t0); fsquare(t0,t1); } + /* 2^200 - 2^0 */ fmul(t1,t0,z2_100_0); + + /* 2^201 - 2^1 */ fsquare(t0,t1); + /* 2^202 - 2^2 */ fsquare(t1,t0); + /* 2^250 - 2^50 */ for (i = 2;i < 50;i += 2) { fsquare(t0,t1); fsquare(t1,t0); } + /* 2^250 - 2^0 */ fmul(t0,t1,z2_50_0); + + /* 2^251 - 2^1 */ fsquare(t1,t0); + /* 2^252 - 2^2 */ fsquare(t0,t1); + /* 2^253 - 2^3 */ fsquare(t1,t0); + /* 2^254 - 2^4 */ fsquare(t0,t1); + /* 2^255 - 2^5 */ fsquare(t1,t0); + /* 2^255 - 21 */ fmul(out,t1,z11); +} + +int +curve25519_donna(u8 *mypublic, const u8 *secret, const u8 *basepoint) { + limb bp[10], x[10], z[11], zmone[10]; + uint8_t e[32]; + int i; + + for (i = 0; i < 32; ++i) e[i] = secret[i]; + e[0] &= 
248; + e[31] &= 127; + e[31] |= 64; + + fexpand(bp, basepoint); + cmult(x, z, e, bp); + crecip(zmone, z); + fmul(z, x, zmone); + fcontract(mypublic, z); + return 0; +} diff --git a/curve25519-donna.podspec b/curve25519-donna.podspec new file mode 100644 index 0000000..0f2f31a --- /dev/null +++ b/curve25519-donna.podspec @@ -0,0 +1,13 @@ +Pod::Spec.new do |s| + s.name = "curve25519-donna" + s.version = "1.2.1" + s.summary = "Implementations of a fast elliptic-curve, Diffie-Hellman primitive" + s.description = <<-DESC + Curve25519 is a state-of-the-art Diffie-Hellman function suitable for a wide variety of applications. + DESC + s.homepage = "http://code.google.com/p/curve25519-donna" + s.license = 'BSD 3-Clause' + s.author = 'Dan Bernstein' + s.source = { :git => "https://github.com/agl/curve25519-donna.git", :tag => "1.2.1" } + s.source_files = 'curve25519-donna.c' +end diff --git a/python-src/curve25519/__init__.py b/python-src/curve25519/__init__.py new file mode 100644 index 0000000..873ff57 --- /dev/null +++ b/python-src/curve25519/__init__.py @@ -0,0 +1,4 @@ + +from .keys import Private, Public + +hush_pyflakes = [Private, Public]; del hush_pyflakes diff --git a/python-src/curve25519/curve25519module.c b/python-src/curve25519/curve25519module.c new file mode 100644 index 0000000..e309ec0 --- /dev/null +++ b/python-src/curve25519/curve25519module.c @@ -0,0 +1,105 @@ +/* tell python that PyArg_ParseTuple(t#) means Py_ssize_t, not int */ +#define PY_SSIZE_T_CLEAN +#include <Python.h> +#if (PY_VERSION_HEX < 0x02050000) + typedef int Py_ssize_t; +#endif + +/* This is required for compatibility with Python 2. 
*/
+#if PY_MAJOR_VERSION >= 3
+    #include <bytesobject.h>
+    #define y "y"
+#else
+    #define PyBytes_FromStringAndSize PyString_FromStringAndSize
+    #define y "t"
+#endif
+
+#include <string.h>  /* memcpy */
+
+int curve25519_donna(char *mypublic,
+                     const char *secret, const char *basepoint);
+
+/* make_private(data) -> private
+ *
+ * Takes exactly 32 bytes and returns a new 32-byte string in which the low
+ * three bits of byte 0 are cleared, bit 7 of byte 31 is cleared and bit 6 of
+ * byte 31 is set. Raises ValueError when the input is not 32 bytes long.
+ */
+static PyObject *
+pycurve25519_makeprivate(PyObject *self, PyObject *args)
+{
+    const char *in1;
+    char clamped[32];
+    Py_ssize_t in1len;
+    if (!PyArg_ParseTuple(args, y"#:clamp", &in1, &in1len))
+        return NULL;
+    if (in1len != 32) {
+        PyErr_SetString(PyExc_ValueError, "input must be 32-byte string");
+        return NULL;
+    }
+    /* in1 points into the caller's bytes object, which must not be written
+     * to; clamp a private copy so the argument is left untouched. */
+    memcpy(clamped, in1, sizeof(clamped));
+    clamped[0] &= 248;
+    clamped[31] &= 127;
+    clamped[31] |= 64;
+    return PyBytes_FromStringAndSize(clamped, 32);
+}
+
+/* make_public(private) -> public
+ *
+ * Calls curve25519_donna() with the 32-byte private value and the base point
+ * {9, 0, ..., 0} and returns the 32-byte result. Raises ValueError when the
+ * input is not 32 bytes long.
+ */
+static PyObject *
+pycurve25519_makepublic(PyObject *self, PyObject *args)
+{
+    const char *private;
+    char mypublic[32];
+    char basepoint[32] = {9};
+    Py_ssize_t privatelen;
+    if (!PyArg_ParseTuple(args, y"#:makepublic", &private, &privatelen))
+        return NULL;
+    if (privatelen != 32) {
+        PyErr_SetString(PyExc_ValueError, "input must be 32-byte string");
+        return NULL;
+    }
+    curve25519_donna(mypublic, private, basepoint);
+    return PyBytes_FromStringAndSize((char *)mypublic, 32);
+}
+
+/* make_shared(myprivate, theirpublic) -> shared
+ *
+ * Calls curve25519_donna() with our 32-byte private value and the peer's
+ * 32-byte public value and returns the raw 32-byte result. Raises ValueError
+ * when either input is not 32 bytes long.
+ */
+static PyObject *
+pycurve25519_makeshared(PyObject *self, PyObject *args)
+{
+    const char *myprivate, *theirpublic;
+    char shared_key[32];
+    Py_ssize_t myprivatelen, theirpubliclen;
+    if (!PyArg_ParseTuple(args, y"#"y"#:generate",
+                          &myprivate, &myprivatelen, &theirpublic, &theirpubliclen))
+        return NULL;
+    if (myprivatelen != 32) {
+        PyErr_SetString(PyExc_ValueError, "input must be 32-byte string");
+        return NULL;
+    }
+    if (theirpubliclen != 32) {
+        PyErr_SetString(PyExc_ValueError, "input must be 32-byte string");
+        return NULL;
+    }
+    curve25519_donna(shared_key, myprivate, theirpublic);
+    return PyBytes_FromStringAndSize((char *)shared_key, 32);
+}
+
+
+/* Method table exported by the _curve25519 module. */
+static PyMethodDef
+curve25519_functions[] = {
+    {"make_private", pycurve25519_makeprivate, METH_VARARGS, "data->private"},
+    {"make_public", pycurve25519_makepublic, METH_VARARGS, "private->public"},
+    {"make_shared", pycurve25519_makeshared, METH_VARARGS, "private+public->shared"},
+    {NULL, NULL, 0, NULL},
+};
+
+#if PY_MAJOR_VERSION >= 3
+    static struct PyModuleDef
+    curve25519_module = {
+        PyModuleDef_HEAD_INIT,
+        "_curve25519",          /* m_name */
+        NULL,                   /* m_doc */
+        0,                      /* m_size: an integer, the module keeps no state */
+        curve25519_functions,   /* m_methods */
+    };
+
+    /* PyMODINIT_FUNC supplies the return type plus the export/linkage
+     * decoration the interpreter expects on a module init function. */
+    PyMODINIT_FUNC
+    PyInit__curve25519(void)
+    {
+        return PyModule_Create(&curve25519_module);
+    }
+#else
+    PyMODINIT_FUNC
+    init_curve25519(void)
+    {
+        (void)Py_InitModule("_curve25519", curve25519_functions);
+    }
+#endif
\ No newline at end of file
diff --git a/python-src/curve25519/keys.py b/python-src/curve25519/keys.py
new file mode 100644
index 0000000..e131dac
--- /dev/null
+++ b/python-src/curve25519/keys.py
@@ -0,0 +1,46 @@
+from . import _curve25519
+from hashlib import sha256
+import os
+
+# the curve25519 functions are really simple, and could be used without an
+# OOP layer, but it's a bit too easy to accidentally swap the private and
+# public keys that way.
+
+def _hash_shared(shared):
+    """Default hashfunc: SHA-256 digest of a fixed prefix plus the raw value."""
+    prefixed = b"curve25519-shared:"+shared
+    return sha256(prefixed).digest()
+
+class Private:
+    """Holds a 32-byte private key produced by _curve25519.make_private()."""
+
+    def __init__(self, secret=None, seed=None):
+        """Build the key from `secret`, from `seed`, or from os.urandom(32).
+
+        Giving both `secret` and `seed` trips an assertion. A `secret` that
+        is not a 32-byte bytes object raises TypeError.
+        """
+        if secret is not None:
+            assert seed is None, "provide secret, seed, or neither, not both"
+        elif seed is not None:
+            # stretch an arbitrary-length seed to exactly 32 bytes
+            secret = sha256(b"curve25519-private:"+seed).digest()
+        else:
+            secret = os.urandom(32)
+        if not (isinstance(secret, bytes) and len(secret) == 32):
+            raise TypeError("secret= must be 32-byte string")
+        self.private = _curve25519.make_private(secret)
+
+    def serialize(self):
+        """Return the stored private key bytes."""
+        return self.private
+
+    def get_public(self):
+        """Return the matching key wrapped in a Public instance."""
+        raw_public = _curve25519.make_public(self.private)
+        return Public(raw_public)
+
+    def get_shared_key(self, public, hashfunc=None):
+        """Return hashfunc(shared) for this key and the peer's Public key.
+
+        `hashfunc` defaults to _hash_shared. Raises ValueError when `public`
+        is not a Public instance.
+        """
+        if not isinstance(public, Public):
+            raise ValueError("'public' must be an instance of Public")
+        digest = _hash_shared if hashfunc is None else hashfunc
+        raw_shared = _curve25519.make_shared(self.private, public.public)
+        return digest(raw_shared)
+
+class Public:
+    """Wraps a 32-byte public key."""
+
+    def __init__(self, public):
+        """Store `public`; asserts that it is a bytes object of length 32."""
+        assert isinstance(public, bytes)
+        assert len(public) == 32
+        self.public = public
+
+    def serialize(self):
+        """Return the stored public key bytes."""
+        return self.public
diff --git a/python-src/curve25519/test/__init__.py b/python-src/curve25519/test/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/python-src/curve25519/test/__init__.py
diff --git a/python-src/curve25519/test/test_curve25519.py b/python-src/curve25519/test/test_curve25519.py
new file mode 100755
index 0000000..2ecbd47
--- /dev/null
+++ b/python-src/curve25519/test/test_curve25519.py
@@ -0,0 +1,99 @@
+#! /usr/bin/python
+
+import unittest
+
+from curve25519 import Private, Public
+from hashlib import sha1, sha256
+from binascii import hexlify
+
+# Unit tests for the Private/Public wrappers exported by the curve25519
+# package. Expected values are pinned as hex strings.
+class Basic(unittest.TestCase):
+    # Two parties built from fixed secrets must derive the same shared key,
+    # and a Public rebuilt from its serialized form must behave identically.
+    def test_basic(self):
+        secret1 = b"abcdefghijklmnopqrstuvwxyz123456"
+        self.assertEqual(len(secret1), 32)
+
+        secret2 = b"654321zyxwvutsrqponmlkjihgfedcba"
+        self.assertEqual(len(secret2), 32)
+        priv1 = Private(secret=secret1)
+        pub1 = priv1.get_public()
+        priv2 = Private(secret=secret2)
+        pub2 = priv2.get_public()
+        shared12 = priv1.get_shared_key(pub2)
+        e = b"b0818125eab42a8ac1af5e8b9b9c15ed2605c2bbe9675de89e5e6e7f442b9598"
+        self.assertEqual(hexlify(shared12), e)
+        shared21 = priv2.get_shared_key(pub1)
+        self.assertEqual(shared12, shared21)
+
+        # round-trip the public key through serialize()
+        pub2a = Public(pub2.serialize())
+        shared12a = priv1.get_shared_key(pub2a)
+        self.assertEqual(hexlify(shared12a), e)
+
+    # Passing a Private where a Public is required must raise ValueError.
+    def test_errors(self):
+        priv1 = Private()
+        self.assertRaises(ValueError, priv1.get_shared_key, priv1)
+
+    # Exercises every way of constructing a Private: secret=, seed=, and
+    # neither (random), plus the rejected combinations.
+    def test_seed(self):
+        # use 32-byte secret
+        self.assertRaises(TypeError, Private, secret=123)
+        self.assertRaises(TypeError, Private, secret=b"too short")
+        secret1 = b"abcdefghijklmnopqrstuvwxyz123456"
+        assert len(secret1) == 32
+        priv1 = Private(secret=secret1)
+        priv1a = Private(secret=secret1)
+        priv1b = Private(priv1.serialize())
+        self.assertEqual(priv1.serialize(), priv1a.serialize())
+        self.assertEqual(priv1.serialize(), priv1b.serialize())
+        e = b"6062636465666768696a6b6c6d6e6f707172737475767778797a313233343576"
+        self.assertEqual(hexlify(priv1.serialize()), e)
+
+        # the private key is a clamped form of the secret, so they won't
+        # quite be the same
+        p = Private(secret=b"\x00"*32)
+        self.assertEqual(hexlify(p.serialize()), b"00"*31+b"40")
+        p = Private(secret=b"\xff"*32)
+        self.assertEqual(hexlify(p.serialize()), b"f8"+b"ff"*30+b"7f")
+
+        # use arbitrary-length seed
+        self.assertRaises(TypeError, Private, seed=123)
+        priv1 = Private(seed=b"abc")
+        priv1a = Private(seed=b"abc")
+        priv1b = Private(priv1.serialize())
+        self.assertEqual(priv1.serialize(), priv1a.serialize())
+        self.assertEqual(priv1.serialize(), priv1b.serialize())
+        self.assertRaises(AssertionError, Private, seed=b"abc", secret=b"no")
+
+        # NOTE(review): this stanza repeats checks already made just above
+        priv1 = Private(seed=b"abc")
+        priv1a = Private(priv1.serialize())
+        self.assertEqual(priv1.serialize(), priv1a.serialize())
+        self.assertRaises(AssertionError, Private, seed=b"abc", secret=b"no")
+
+        # use built-in os.urandom
+        priv2 = Private()
+        priv2a = Private(priv2.private)
+        self.assertEqual(priv2.serialize(), priv2a.serialize())
+
+        # attempt to use both secret= and seed=, not allowed
+        self.assertRaises(AssertionError, Private, seed=b"abc", secret=b"no")
+
+    # The default hash and an explicitly supplied equivalent must agree, and a
+    # caller-supplied hashfunc's return value must be passed through as-is.
+    def test_hashfunc(self):
+        priv1 = Private(seed=b"abc")
+        priv2 = Private(seed=b"def")
+        shared_sha256 = priv1.get_shared_key(priv2.get_public())
+        e = b"da959ffe77ebeb4757fe5ba310e28ede425ae0d0ff5ec9c884e2d08f311cf5e5"
+        self.assertEqual(hexlify(shared_sha256), e)
+
+        # confirm the hash function remains what we think it is
+        def myhash(shared_key):
+            return sha256(b"curve25519-shared:"+shared_key).digest()
+        shared_myhash = priv1.get_shared_key(priv2.get_public(), myhash)
+        self.assertEqual(hexlify(shared_myhash), e)
+
+        # a hashfunc may return something other than a raw digest
+        def hexhash(shared_key):
+            return sha1(shared_key).hexdigest().encode()
+        shared_hexhash = priv1.get_shared_key(priv2.get_public(), hexhash)
+        self.assertEqual(shared_hexhash,
+                         b"80eec98222c8edc4324fb9477a3c775ce7c6c93a")
+
+
+if __name__ == "__main__":
+    unittest.main()
+
diff --git a/python-src/curve25519/test/test_speed.py b/python-src/curve25519/test/test_speed.py
new file mode 100755
index 0000000..87952fa
--- /dev/null
+++ b/python-src/curve25519/test/test_speed.py
@@ -0,0 +1,46 @@
+#! 
/usr/bin/python
+
+from time import time
+from curve25519 import Private
+
+# number of key pairs generated and timed by the loop below
+count = 10000
+elapsed_get_public = 0.0
+elapsed_get_shared = 0.0
+
+def abbreviate_time(data):
+    """Format a duration in seconds as a short string with a unit suffix.
+
+    Returns "" for None; otherwise picks s, ms, us or ns by magnitude.
+    """
+    # 1.23s, 790ms, 132us
+    if data is None:
+        return ""
+    s = float(data)
+    if s >= 10:
+        #return abbreviate.abbreviate_time(data)
+        # whole seconds, with the unit every other branch already carries
+        return "%ds" % s
+    if s >= 1.0:
+        return "%.2fs" % s
+    if s >= 0.01:
+        return "%dms" % (1000*s)
+    if s >= 0.001:
+        return "%.1fms" % (1000*s)
+    if s >= 0.000001:
+        return "%.1fus" % (1000000*s)
+    return "%dns" % (1000000000*s)
+
+def nohash(key): return key
+
+# Only get_public() and get_shared_key() are timed; creating the keys
+# themselves happens outside the measured intervals.
+for i in range(count):
+    p = Private()
+    start = time()
+    pub = p.get_public()
+    elapsed_get_public += time() - start
+    pub2 = Private().get_public()
+    start = time()
+    shared = p.get_shared_key(pub2) #, hashfunc=nohash)
+    elapsed_get_shared += time() - start
+
+print("get_public: %s" % abbreviate_time(elapsed_get_public / count))
+print("get_shared: %s" % abbreviate_time(elapsed_get_shared / count))
+
+# these take about 560us-570us each (with the default compiler settings, -Os)
+# on my laptop, same with -O2
+# of which the python overhead is about 5us
+# and the get_shared_key() hash step adds about 5us
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000..dc1b8eb
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,38 @@
+#! /usr/bin/python
+
+from subprocess import Popen, PIPE
+from distutils.core import setup, Extension
+
+# the package version is whatever "git describe --tags" prints
+version = Popen(["git", "describe", "--tags"], stdout=PIPE).communicate()[0]\
+          .strip().decode("utf8")
+
+# the extension is built from the wrapper module plus the portable C source
+ext_modules = [Extension("curve25519._curve25519",
+                         ["python-src/curve25519/curve25519module.c",
+                          "curve25519-donna.c"],
+                         )]
+
+short_description="Python wrapper for the Curve25519 cryptographic library"
+long_description="""\
+Curve25519 is a fast elliptic-curve key-agreement protocol, in which two
+parties Alice and Bob each generate a (public,private) keypair, exchange
+public keys, and can then compute the same shared key. Specifically, Alice
+computes F(Aprivate, Bpublic), Bob computes F(Bprivate, Apublic), and both
+get the same value (and nobody else can guess that shared value, even if they
+know Apublic and Bpublic).
+
+This is a Python wrapper for the portable 'curve25519-donna' implementation
+of this algorithm, written by Adam Langley, hosted at
+http://code.google.com/p/curve25519-donna/
+"""
+
+setup(name="curve25519-donna",
+      version=version,
+      description=short_description,
+      long_description=long_description,
+      author="Brian Warner",
+      author_email="warner-pycurve25519-donna@lothar.com",
+      license="BSD",
+      packages=["curve25519", "curve25519.test"],
+      package_dir={"curve25519": "python-src/curve25519"},
+      ext_modules=ext_modules,
+      )
diff --git a/speed-curve25519.c b/speed-curve25519.c
new file mode 100644
index 0000000..d945d48
--- /dev/null
+++ b/speed-curve25519.c
@@ -0,0 +1,50 @@
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+#include <time.h>
+#include <stdint.h>
+
+typedef uint8_t u8;
+
+extern void curve25519_donna(u8 *output, const u8 *secret, const u8 *bp);
+
+/* Returns the current gettimeofday() time in microseconds. */
+static uint64_t
+time_now(void) {
+  struct timeval tv;
+  uint64_t ret;
+
+  gettimeofday(&tv, NULL);
+  ret = tv.tv_sec;
+  ret *= 1000000;
+  ret += tv.tv_usec;
+
+  return ret;
+}
+
+/* Prints the average time of one curve25519_donna() call, in microseconds,
+ * measured over 30000 calls after 1000 untimed warm-up calls. */
+int
+main(void) {
+  static const unsigned char basepoint[32] = {9};
+  unsigned char mysecret[32], mypublic[32];
+  unsigned i;
+  uint64_t start, end;
+
+  memset(mysecret, 42, 32);
+  mysecret[0] &= 248;
+  mysecret[31] &= 127;
+  mysecret[31] |= 64;
+
+  // Load the caches
+  for (i = 0; i < 1000; ++i) {
+    curve25519_donna(mypublic, mysecret, basepoint);
+  }
+
+  start = time_now();
+  for (i = 0; i < 30000; ++i) {
+    curve25519_donna(mypublic, mysecret, basepoint);
+  }
+  end = time_now();
+
+  printf("%luus\n", (unsigned long) ((end - start) / 30000));
+
+  return 0;
+}
diff --git a/test-curve25519.c b/test-curve25519.c
new file mode 100644
index 0000000..591d871
--- /dev/null
+++ b/test-curve25519.c
@@ -0,0 +1,54 @@
+/* 
+test-curve25519 version 20050915
+D. J. Bernstein
+Public domain.
+
+Tiny modifications by agl
+*/
+
+#include <stdio.h>
+
+extern void curve25519_donna(unsigned char *output, const unsigned char *a,
+                             const unsigned char *b);
+void doit(unsigned char *ek,unsigned char *e,unsigned char *k);
+
+/* Prints e and k as hex, computes ek = curve25519_donna(e, k), then prints ek
+ * as hex followed by a newline. All three buffers are 32 bytes. */
+void doit(unsigned char *ek,unsigned char *e,unsigned char *k)
+{
+  int i;
+
+  for (i = 0;i < 32;++i) printf("%02x",(unsigned int) e[i]); printf(" ");
+  for (i = 0;i < 32;++i) printf("%02x",(unsigned int) k[i]); printf(" ");
+  curve25519_donna(ek,e,k);
+  for (i = 0;i < 32;++i) printf("%02x",(unsigned int) ek[i]); printf("\n");
+}
+
+/* Working buffers: the two scalars e1 and e2, the point k, and the four
+ * products computed from them on each iteration. */
+unsigned char e1k[32];
+unsigned char e2k[32];
+unsigned char e1e2k[32];
+unsigned char e2e1k[32];
+unsigned char e1[32] = {3};
+unsigned char e2[32] = {5};
+unsigned char k[32] = {9};
+
+/* Runs 10000 rounds. Each round computes e2*(e1*k) and e1*(e2*k), prints
+ * "fail" and returns 1 if they differ, then XORs the results back into e1,
+ * e2 and k to derive the next round's inputs. Returns 0 when every round
+ * agrees. */
+int
+main()
+{
+  int loop;
+  int i;
+
+  for (loop = 0;loop < 10000;++loop) {
+    doit(e1k,e1,k);
+    doit(e2e1k,e2,e1k);
+    doit(e2k,e2,k);
+    doit(e1e2k,e1,e2k);
+    for (i = 0;i < 32;++i) if (e1e2k[i] != e2e1k[i]) {
+      printf("fail\n");
+      return 1;
+    }
+    for (i = 0;i < 32;++i) e1[i] ^= e2k[i];
+    for (i = 0;i < 32;++i) e2[i] ^= e1k[i];
+    for (i = 0;i < 32;++i) k[i] ^= e1e2k[i];
+  }
+
+  return 0;
+}
diff --git a/test-noncanon.c b/test-noncanon.c
new file mode 100644
index 0000000..6de4e8d
--- /dev/null
+++ b/test-noncanon.c
@@ -0,0 +1,39 @@
+/* This file can be used to test whether the code handles non-canonical curve
+ * points (i.e. points with the 256th bit set) in the same way as the reference
+ * implementation. 
*/
+
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+extern void curve25519_donna(unsigned char *output, const unsigned char *a,
+                             const unsigned char *b);
+/* Multiplies point1 and point2 by the scalar {1, 0, ..., 0} and compares the
+ * two outputs: returns 1 if they are equal, 0 otherwise. */
+int
+main()
+{
+  static const uint8_t point1[32] = {
+    0x25,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+  };
+  static const uint8_t point2[32] = {
+    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
+    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
+    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
+    0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
+  };
+  static const uint8_t scalar[32] = { 1 };
+  uint8_t out1[32], out2[32];
+
+  curve25519_donna(out1, scalar, point1);
+  curve25519_donna(out2, scalar, point2);
+
+  /* identical outputs are reported as the failure case */
+  if (0 == memcmp(out1, out2, sizeof(out1))) {
+    fprintf(stderr, "Top bit not ignored.\n");
+    return 1;
+  }
+
+  fprintf(stderr, "Top bit correctly ignored.\n");
+  return 0;
+}
diff --git a/test-sc-curve25519.c b/test-sc-curve25519.c
new file mode 100644
index 0000000..14a7e3c
--- /dev/null
+++ b/test-sc-curve25519.c
@@ -0,0 +1,72 @@
+#define _GNU_SOURCE
+
+#include <inttypes.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+
+extern void curve25519_donna(uint8_t *, const uint8_t *, const uint8_t *);
+extern uint64_t tsc_read();
+
+/* Samples whose tsc_read() delta exceeds this are discarded as outliers. */
+static const uint64_t max_delta = 650000;
+
+/* Times `count` calls of curve25519_donna(output, scalar, point) with
+ * tsc_read() and prints "<label>: mean:.. sd:.. skipped:..".
+ *
+ * The mean and standard deviation are taken over the samples that were
+ * actually kept, not over `count`, so discarded outliers no longer drag the
+ * statistics down. If every sample was discarded, both are reported as 0.
+ */
+static void
+report_timing(const char *label, const uint8_t *scalar, const uint8_t *point,
+              unsigned count) {
+  uint8_t output[32];
+  uint64_t sum = 0, sum_squares = 0, skipped = 0, mean = 0, variance = 0;
+  uint64_t kept;
+  unsigned i;
+
+  for (i = 0; i < count; ++i) {
+    const uint64_t start = tsc_read();
+    curve25519_donna(output, scalar, point);
+    const uint64_t end = tsc_read();
+    const uint64_t delta = end - start;
+    if (delta > max_delta) {
+      // something terrible happened (task switch etc)
+      skipped++;
+      continue;
+    }
+    sum += delta;
+    sum_squares += (delta * delta);
+  }
+
+  kept = (uint64_t) count - skipped;
+  if (kept > 0) {
+    mean = sum / kept;
+    variance = sum_squares / kept - mean * mean;
+  }
+
+  /* PRIu64 matches uint64_t on every platform, unlike %lu */
+  printf("%s: mean:%" PRIu64 " sd:%f skipped:%" PRIu64 "\n",
+         label, mean, sqrt((double) variance), skipped);
+}
+
+/* Compares the timing of curve25519_donna() for an all-zero-bits and an
+ * all-one-bits first input against the same derived public value. */
+int
+main(int argc, char **argv) {
+  uint8_t private_key[32], public[32], peer1[32], peer2[32];
+  static const uint8_t basepoint[32] = {9};
+  static const unsigned count = 200000;
+
+  (void) argc;
+  (void) argv;
+
+  memset(private_key, 42, sizeof(private_key));
+
+  private_key[0] &= 248;
+  private_key[31] &= 127;
+  private_key[31] |= 64;
+
+  curve25519_donna(public, private_key, basepoint);
+  memset(peer1, 0, sizeof(peer1));
+  memset(peer2, 255, sizeof(peer2));
+
+  report_timing("all 0", peer1, public, count);
+  report_timing("all 1", peer2, public, count);
+
+  return 0;
+}
diff --git a/test-sc-curve25519.s b/test-sc-curve25519.s
new file mode 100644
index 0000000..1da4f68
--- /dev/null
+++ b/test-sc-curve25519.s
@@ -0,0 +1,8 @@
+.text
+.globl tsc_read
+
+tsc_read:
+rdtsc
+shl $32,%rdx
+or %rdx,%rax
+ret |