#include <stddef.h>

#include <stddef.h>
#include <inttypes.h>

/*
* "constant time" memcmp. Time taken depends on the buffer length, of
* course, but not on the content of the buffers.
*
* Just like the ordinary memcmp function, the return value is
* tri-state: <0, 0, or >0. However, applications that need a
* constant-time memory comparison function usually need only a
* two-state result, signalling only whether the inputs were identical
* or different, but not signalling which of the inputs was larger.
* This code could be made significantly faster and simpler if the
* requirement for a tri-state result were removed.
*
* In order to protect against adversaries who can observe timing,
* cache hits or misses, page faults, etc., and who can use such
* observations to learn something about the relationship between the
* contents of the two buffers, we have to perform exactly the same
* instructions and memory accesses regardless of the contents of the
* buffers. We can't stop as soon as we find a difference, we can't
* take different conditional branches depending on the data, and we
* can't use different pointers or array indexes depending on the data.
*
* Further reading:
*
* .Rs
* .%A Paul C. Kocher
* .%T Timing Attacks on Implementations of Diffie-Hellman, RSA, DSS, and Other Systems
* .%D 1996
* .%J CRYPTO 1996
* .%P 104-113
* .%U http://www.cryptography.com/timingattack/paper.html
* .%U http://www.webcitation.org/query?url=http%3A%2F%2Fwww.cryptography.com%2Ftimingattack%2Fpaper.html&date=2012-10-17
* .Re
*
* .Rs
* .%A D. Boneh
* .%A D. Brumley
* .%T Remote timing attacks are practical
* .%D August 2003
* .%J Proceedings of the 12th Usenix Security Symposium, 2003
* .%U https://crypto.stanford.edu/~dabo/abstracts/ssl-timing.html
* .%U http://www.webcitation.org/query?url=https%3A%2F%2Fcrypto.stanford.edu%2F%7Edabo%2Fabstracts%2Fssl-timing.html&date=2012-10-17
* .%U http://www.webcitation.org/query?url=http%3A%2F%2Fcrypto.stanford.edu%2F%7Edabo%2Fpubs%2Fpapers%2Fssl-timing.pdf&date=2012-10-17
* .Es
*
* .Rs
* .%A Coda Hale
* .%T A Lesson In Timing Attacks (or, Don't use MessageDigest.isEquals)
* .%D 13 Aug 2009
* .%U http://codahale.com/a-lesson-in-timing-attacks/
* .%U http://www.webcitation.org/query?url=http%3A%2F%2Fcodahale.com%2Fa-lesson-in-timing-attacks%2F&date=2012-10-17
* .Re
*
*/

/*
* A note on portability:
*
* We assume that char is exactly 8 bits, the same as uint8_t, and that
* integer types with exactly 16 bits and exactly 32 bits exist. (If
* there is ever a need to change this, then the actual requirement is
* that we need a type that is at least two bits wider than char, and
* another type that is at least two bits wider than that, or we need to
* fake it somehow.)
*
* We do not assume any particular size for the plain "int" type, except
* that it is at least 16 bits, as is guaranteed by the C language
* standard.
*
* We do not assume that signed integer overflow is harmless. We
* ensure that signed integer overflow does not occur, so that
* implementation-defined overflow behaviour is not invoked.
*
* We rely on the C standard's guarantees regarding the wraparound
* behaviour of unsigned integer arithmetic, and on the analagous
* guarantees regarding conversions from signed types to narrower
* unsigned types.
*
* We do not assume that the platform uses two's complement arithmetic.
*/

/*
* How hard do we have to try to prevent unwanted compiler optimisations?
*
* Try compiling with "#define USE_VOLATILE_TEMPORARY 0", and examine
* the compiler output. If the only conditional tests in the entire
* function are to test whether len is zero, then all is well, but try
* again with different optimisation flags to be sure. If the compiler
* emitted code with conditional tests that do anything other than
* testing whether len is zero, then that's a problem, so try again with
* "#define USE_VOLATILE_TEMPORARY 1". If it's still bad, then you are
* out of luck.
*/
#define USE_VOLATILE_TEMPORARY 0

int consttime_memcmp(const void *b1, const void *b2, size_t len)
{
const uint8_t *c1, *c2;
uint16_t d, r, m;

#if USE_VOLATILE_TEMPORARY
volatile uint16_t v;
#else
uint16_t v;
#endif

c1 = b1;
c2 = b2;

r = 0;
while (len) {
/*
* Take the low 8 bits of r (in the range 0x00 to 0xff,
* or 0 to 255);
* As explained elsewhere, the low 8 bits of r will be zero
* if and only if all bytes compared so far were identical;
* Zero-extend to a 16-bit type (in the range 0x0000 to
* 0x00ff);
* Add 255, yielding a result in the range 255 to 510;
* Save that in a volatile variable to prevent
* the compiler from trying any shortcuts (the
* use of a volatile variable depends on "#ifdef
* USE_VOLATILE_TEMPORARY", and most compilers won't
* need it);
* Divide by 256 yielding a result of 1 if the original
* value of r was non-zero, or 0 if r was zero;
* Subtract 1, yielding 0 if r was non-zero, or -1 if r
* was zero;
* Convert to uint16_t, yielding 0x0000 if r was
* non-zero, or 0xffff if r was zero;
* Save in m.
*/
v = ((uint16_t)(uint8_t)r)+255;
m = v/256-1;

/*
* Get the values from *c1 and *c2 as uint8_t (each will
* be in the range 0 to 255, or 0x00 to 0xff);
* Convert them to signed int values (still in the
* range 0 to 255);
* Subtract them using signed arithmetic, yielding a
* result in the range -255 to +255;
* Convert to uint16_t, yielding a result in the range
* 0xff01 to 0xffff (for what was previously -255 to
* -1), or 0, or in the range 0x0001 to 0x00ff (for what
* was previously +1 to +255).
*/
d = (uint16_t)((int)*c1 - (int)*c2);

/*
* If the low 8 bits of r were previously 0, then m
* is now 0xffff, so (d & m) is the same as d, so we
* effectively copy d to r;
* Otherwise, if r was previously non-zero, then m is
* now 0, so (d & m) is zero, so leave r unchanged.
* Note that the low 8 bits of d will be zero if and
* only if d == 0, which happens when *c1 == *c2.
* The low 8 bits of r are thus zero if and only if the
* entirety of r is zero, which happens if and only if
* all bytes compared so far were equal. As soon as a
* non-zero value is stored in r, it remains unchanged
* for the remainder of the loop.
*/
r |= (d & m);

/*
* Increment pointers, decrement length, and loop.
*/
++c1;
++c2;
--len;
}

/*
* At this point, r is an unsigned value, which will be 0 if the
* final result should be zero, or in the range 0x0001 to 0x00ff
* (1 to 255) if the final result should be positive, or in the
* range 0xff01 to 0xffff (65281 to 65535) if the final result
* should be negative.
*
* We want to convert the unsigned values in the range 0xff01
* to 0xffff to signed values in the range -255 to -1, while
* converting the other unsigned values to equivalent signed
* values (0, or +1 to +255).
*
* On a machine with two's complement arithmetic, simply copying
* the underlying bits (with sign extension if int is wider than
* 16 bits) would do the job, so something like this might work:
*
* return (int16_t)r;
*
* However, that invokes implementation-defined behaviour,
* because values larger than 32767 can't fit in a signed 16-bit
* integer without overflow.
*
* To avoid any implementation-defined behaviour, we go through
* these contortions:
*
* a. Calculate ((uint32_t)r + 0x8000). The cast to uint32_t
* it to prevent problems on platforms where int is narrower
* than 32 bits. If int is a larger than 32-bits, then the
* usual arithmetic conversions cause this addition to be
* done in unsigned int arithmetic. If int is 32 bits
* or narrower, then this addition is done in uint32_t
* arithmetic. In either case, no overflow or wraparound
* occurs, and the result from this step has a value that
* will be one of 0x00008000 (32768), or in the range
* 0x00008001 to 0x000080ff (32769 to 33023), or in the range
* 0x00017f01 to 0x00017fff (98049 to 98303).
*
* b. Cast the result from (a) to uint16_t. This effectively
* discards the high bits of the result, in a way that is
* well defined by the C language. The result from this step
* will be of type uint16_t, and its value will be one of
* 0x8000 (32768), or in the range 0x8001 to 0x80ff (32769 to
* 33023), or in the range 0x7f01 to 0x7fff (32513 to
* 32767).
*
* c. Cast the result from (b) to int32_t. We use int32_t
* instead of int because we need a type that's strictly
* larger than 16 bits, and the C standard allows
* implementations where int is only 16 bits. The result
* from this step will be of type int32_t, and its value wll
* be one of 0x00008000 (32768), or in the range 0x00008001
* to 0x000080ff (32769 to 33023), or in the range 0x00007f01
* to 0x00007fff (32513 to 32767).
*
* d. Take the result from (c) and subtract 0x8000 (32768) using
* signed int32_t arithmetic. The result from this step will
* be of type int32_t and the value will be one of
* 0x00000000 (0), or in the range 0x00000001 to 0x000000ff
* (+1 to +255), or in the range 0xffffff01 to 0xffffffff
* (-255 to -1).
*
* e. Cast the result from (d) to int. This does nothing
* interesting, except to make explicit what would have been
* implicit in the return statement. The final result is an
* int in the range -255 to +255.
*
* Unfortunately, compilers don't seem to be good at figuring
* out that most of this can be optimised away by careful choice
* of register width and sign extension.
*
*/
return (/*e*/ int)(/*d*/
(/*c*/ int32_t)(/*b*/ uint16_t)(/*a*/ (uint32_t)r + 0x8000)
- 0x8000);
}