diff --git a/src/secp256k1/src/field_10x26_impl.h b/src/secp256k1/src/field_10x26_impl.h index 62bffdc21..3539d5b89 100644 --- a/src/secp256k1/src/field_10x26_impl.h +++ b/src/secp256k1/src/field_10x26_impl.h @@ -1,1167 +1,1294 @@ /*********************************************************************** * Copyright (c) 2013, 2014 Pieter Wuille * * Distributed under the MIT software license, see the accompanying * * file COPYING or https://www.opensource.org/licenses/mit-license.php.* ***********************************************************************/ #ifndef SECP256K1_FIELD_REPR_IMPL_H #define SECP256K1_FIELD_REPR_IMPL_H #include "util.h" #include "field.h" #ifdef VERIFY static void secp256k1_fe_verify(const secp256k1_fe *a) { const uint32_t *d = a->n; int m = a->normalized ? 1 : 2 * a->magnitude, r = 1; r &= (d[0] <= 0x3FFFFFFUL * m); r &= (d[1] <= 0x3FFFFFFUL * m); r &= (d[2] <= 0x3FFFFFFUL * m); r &= (d[3] <= 0x3FFFFFFUL * m); r &= (d[4] <= 0x3FFFFFFUL * m); r &= (d[5] <= 0x3FFFFFFUL * m); r &= (d[6] <= 0x3FFFFFFUL * m); r &= (d[7] <= 0x3FFFFFFUL * m); r &= (d[8] <= 0x3FFFFFFUL * m); r &= (d[9] <= 0x03FFFFFUL * m); r &= (a->magnitude >= 0); r &= (a->magnitude <= 32); if (a->normalized) { r &= (a->magnitude <= 1); if (r && (d[9] == 0x03FFFFFUL)) { uint32_t mid = d[8] & d[7] & d[6] & d[5] & d[4] & d[3] & d[2]; if (mid == 0x3FFFFFFUL) { r &= ((d[1] + 0x40UL + ((d[0] + 0x3D1UL) >> 26)) <= 0x3FFFFFFUL); } } } VERIFY_CHECK(r == 1); } #endif static void secp256k1_fe_normalize(secp256k1_fe *r) { uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4], t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9]; /* Reduce t9 at the start so there will be at most a single carry from the first pass */ uint32_t m; uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL; /* The first pass ensures the magnitude is 1, ... */ t0 += x * 0x3D1UL; t1 += (x << 6); t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL; t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL; t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL; m = t2; t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL; m &= t3; t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL; m &= t4; t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL; m &= t5; t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL; m &= t6; t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL; m &= t7; t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL; m &= t8; /* ... except for a possible carry at bit 22 of t9 (i.e. 
bit 256 of the field element) */ VERIFY_CHECK(t9 >> 23 == 0); /* At most a single final reduction is needed; check if the value is >= the field characteristic */ x = (t9 >> 22) | ((t9 == 0x03FFFFFUL) & (m == 0x3FFFFFFUL) & ((t1 + 0x40UL + ((t0 + 0x3D1UL) >> 26)) > 0x3FFFFFFUL)); /* Apply the final reduction (for constant-time behaviour, we do it always) */ t0 += x * 0x3D1UL; t1 += (x << 6); t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL; t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL; t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL; t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL; t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL; t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL; t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL; t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL; t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL; /* If t9 didn't carry to bit 22 already, then it should have after any final reduction */ VERIFY_CHECK(t9 >> 22 == x); /* Mask off the possible multiple of 2^256 from the final reduction */ t9 &= 0x03FFFFFUL; r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4; r->n[5] = t5; r->n[6] = t6; r->n[7] = t7; r->n[8] = t8; r->n[9] = t9; #ifdef VERIFY r->magnitude = 1; r->normalized = 1; secp256k1_fe_verify(r); #endif } static void secp256k1_fe_normalize_weak(secp256k1_fe *r) { uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4], t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9]; /* Reduce t9 at the start so there will be at most a single carry from the first pass */ uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL; /* The first pass ensures the magnitude is 1, ... */ t0 += x * 0x3D1UL; t1 += (x << 6); t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL; t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL; t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL; t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL; t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL; t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL; t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL; t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL; t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL; /* ... except for a possible carry at bit 22 of t9 (i.e. bit 256 of the field element) */ VERIFY_CHECK(t9 >> 23 == 0); r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4; r->n[5] = t5; r->n[6] = t6; r->n[7] = t7; r->n[8] = t8; r->n[9] = t9; #ifdef VERIFY r->magnitude = 1; secp256k1_fe_verify(r); #endif } static void secp256k1_fe_normalize_var(secp256k1_fe *r) { uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4], t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9]; /* Reduce t9 at the start so there will be at most a single carry from the first pass */ uint32_t m; uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL; /* The first pass ensures the magnitude is 1, ... */ t0 += x * 0x3D1UL; t1 += (x << 6); t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL; t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL; t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL; m = t2; t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL; m &= t3; t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL; m &= t4; t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL; m &= t5; t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL; m &= t6; t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL; m &= t7; t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL; m &= t8; /* ... except for a possible carry at bit 22 of t9 (i.e. 
bit 256 of the field element) */ VERIFY_CHECK(t9 >> 23 == 0); /* At most a single final reduction is needed; check if the value is >= the field characteristic */ x = (t9 >> 22) | ((t9 == 0x03FFFFFUL) & (m == 0x3FFFFFFUL) & ((t1 + 0x40UL + ((t0 + 0x3D1UL) >> 26)) > 0x3FFFFFFUL)); if (x) { t0 += 0x3D1UL; t1 += (x << 6); t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL; t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL; t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL; t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL; t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL; t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL; t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL; t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL; t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL; /* If t9 didn't carry to bit 22 already, then it should have after any final reduction */ VERIFY_CHECK(t9 >> 22 == x); /* Mask off the possible multiple of 2^256 from the final reduction */ t9 &= 0x03FFFFFUL; } r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4; r->n[5] = t5; r->n[6] = t6; r->n[7] = t7; r->n[8] = t8; r->n[9] = t9; #ifdef VERIFY r->magnitude = 1; r->normalized = 1; secp256k1_fe_verify(r); #endif } static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) { uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4], t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9]; /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */ uint32_t z0, z1; /* Reduce t9 at the start so there will be at most a single carry from the first pass */ uint32_t x = t9 >> 22; t9 &= 0x03FFFFFUL; /* The first pass ensures the magnitude is 1, ... */ t0 += x * 0x3D1UL; t1 += (x << 6); t1 += (t0 >> 26); t0 &= 0x3FFFFFFUL; z0 = t0; z1 = t0 ^ 0x3D0UL; t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL; z0 |= t1; z1 &= t1 ^ 0x40UL; t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL; z0 |= t2; z1 &= t2; t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL; z0 |= t3; z1 &= t3; t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL; z0 |= t4; z1 &= t4; t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL; z0 |= t5; z1 &= t5; t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL; z0 |= t6; z1 &= t6; t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL; z0 |= t7; z1 &= t7; t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL; z0 |= t8; z1 &= t8; z0 |= t9; z1 &= t9 ^ 0x3C00000UL; /* ... except for a possible carry at bit 22 of t9 (i.e. bit 256 of the field element) */ VERIFY_CHECK(t9 >> 23 == 0); return (z0 == 0) | (z1 == 0x3FFFFFFUL); } static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r) { uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; uint32_t z0, z1; uint32_t x; t0 = r->n[0]; t9 = r->n[9]; /* Reduce t9 at the start so there will be at most a single carry from the first pass */ x = t9 >> 22; /* The first pass ensures the magnitude is 1, ... 
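 * (Below, z0 ORs all reduced limbs together, so z0 == 0 iff the raw value is 0,
 * while z1 ANDs each limb XORed with the matching limb of p, so z1 == 0x3FFFFFF
 * iff the raw value is exactly p; either case means the element normalizes to zero.)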
*/ t0 += x * 0x3D1UL; /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */ z0 = t0 & 0x3FFFFFFUL; z1 = z0 ^ 0x3D0UL; /* Fast return path should catch the majority of cases */ if ((z0 != 0UL) & (z1 != 0x3FFFFFFUL)) { return 0; } t1 = r->n[1]; t2 = r->n[2]; t3 = r->n[3]; t4 = r->n[4]; t5 = r->n[5]; t6 = r->n[6]; t7 = r->n[7]; t8 = r->n[8]; t9 &= 0x03FFFFFUL; t1 += (x << 6); t1 += (t0 >> 26); t2 += (t1 >> 26); t1 &= 0x3FFFFFFUL; z0 |= t1; z1 &= t1 ^ 0x40UL; t3 += (t2 >> 26); t2 &= 0x3FFFFFFUL; z0 |= t2; z1 &= t2; t4 += (t3 >> 26); t3 &= 0x3FFFFFFUL; z0 |= t3; z1 &= t3; t5 += (t4 >> 26); t4 &= 0x3FFFFFFUL; z0 |= t4; z1 &= t4; t6 += (t5 >> 26); t5 &= 0x3FFFFFFUL; z0 |= t5; z1 &= t5; t7 += (t6 >> 26); t6 &= 0x3FFFFFFUL; z0 |= t6; z1 &= t6; t8 += (t7 >> 26); t7 &= 0x3FFFFFFUL; z0 |= t7; z1 &= t7; t9 += (t8 >> 26); t8 &= 0x3FFFFFFUL; z0 |= t8; z1 &= t8; z0 |= t9; z1 &= t9 ^ 0x3C00000UL; /* ... except for a possible carry at bit 22 of t9 (i.e. bit 256 of the field element) */ VERIFY_CHECK(t9 >> 23 == 0); return (z0 == 0) | (z1 == 0x3FFFFFFUL); } SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe *r, int a) { r->n[0] = a; r->n[1] = r->n[2] = r->n[3] = r->n[4] = r->n[5] = r->n[6] = r->n[7] = r->n[8] = r->n[9] = 0; #ifdef VERIFY r->magnitude = 1; r->normalized = 1; secp256k1_fe_verify(r); #endif } SECP256K1_INLINE static int secp256k1_fe_is_zero(const secp256k1_fe *a) { const uint32_t *t = a->n; #ifdef VERIFY VERIFY_CHECK(a->normalized); secp256k1_fe_verify(a); #endif return (t[0] | t[1] | t[2] | t[3] | t[4] | t[5] | t[6] | t[7] | t[8] | t[9]) == 0; } SECP256K1_INLINE static int secp256k1_fe_is_odd(const secp256k1_fe *a) { #ifdef VERIFY VERIFY_CHECK(a->normalized); secp256k1_fe_verify(a); #endif return a->n[0] & 1; } SECP256K1_INLINE static void secp256k1_fe_clear(secp256k1_fe *a) { int i; #ifdef VERIFY a->magnitude = 0; a->normalized = 1; #endif for (i=0; i<10; i++) { a->n[i] = 0; } } static int secp256k1_fe_cmp_var(const secp256k1_fe *a, const secp256k1_fe *b) { int i; #ifdef VERIFY VERIFY_CHECK(a->normalized); VERIFY_CHECK(b->normalized); secp256k1_fe_verify(a); secp256k1_fe_verify(b); #endif for (i = 9; i >= 0; i--) { if (a->n[i] > b->n[i]) { return 1; } if (a->n[i] < b->n[i]) { return -1; } } return 0; } static int secp256k1_fe_set_b32(secp256k1_fe *r, const unsigned char *a) { int ret; r->n[0] = (uint32_t)a[31] | ((uint32_t)a[30] << 8) | ((uint32_t)a[29] << 16) | ((uint32_t)(a[28] & 0x3) << 24); r->n[1] = (uint32_t)((a[28] >> 2) & 0x3f) | ((uint32_t)a[27] << 6) | ((uint32_t)a[26] << 14) | ((uint32_t)(a[25] & 0xf) << 22); r->n[2] = (uint32_t)((a[25] >> 4) & 0xf) | ((uint32_t)a[24] << 4) | ((uint32_t)a[23] << 12) | ((uint32_t)(a[22] & 0x3f) << 20); r->n[3] = (uint32_t)((a[22] >> 6) & 0x3) | ((uint32_t)a[21] << 2) | ((uint32_t)a[20] << 10) | ((uint32_t)a[19] << 18); r->n[4] = (uint32_t)a[18] | ((uint32_t)a[17] << 8) | ((uint32_t)a[16] << 16) | ((uint32_t)(a[15] & 0x3) << 24); r->n[5] = (uint32_t)((a[15] >> 2) & 0x3f) | ((uint32_t)a[14] << 6) | ((uint32_t)a[13] << 14) | ((uint32_t)(a[12] & 0xf) << 22); r->n[6] = (uint32_t)((a[12] >> 4) & 0xf) | ((uint32_t)a[11] << 4) | ((uint32_t)a[10] << 12) | ((uint32_t)(a[9] & 0x3f) << 20); r->n[7] = (uint32_t)((a[9] >> 6) & 0x3) | ((uint32_t)a[8] << 2) | ((uint32_t)a[7] << 10) | ((uint32_t)a[6] << 18); r->n[8] = (uint32_t)a[5] | ((uint32_t)a[4] << 8) | ((uint32_t)a[3] << 16) | ((uint32_t)(a[2] & 0x3) << 24); r->n[9] = (uint32_t)((a[2] >> 2) & 0x3f) | ((uint32_t)a[1] << 6) | ((uint32_t)a[0] << 14); ret = !((r->n[9] == 
0x3FFFFFUL) & ((r->n[8] & r->n[7] & r->n[6] & r->n[5] & r->n[4] & r->n[3] & r->n[2]) == 0x3FFFFFFUL) & ((r->n[1] + 0x40UL + ((r->n[0] + 0x3D1UL) >> 26)) > 0x3FFFFFFUL)); #ifdef VERIFY r->magnitude = 1; if (ret) { r->normalized = 1; secp256k1_fe_verify(r); } else { r->normalized = 0; } #endif return ret; } /** Convert a field element to a 32-byte big endian value. Requires the input to be normalized */ static void secp256k1_fe_get_b32(unsigned char *r, const secp256k1_fe *a) { #ifdef VERIFY VERIFY_CHECK(a->normalized); secp256k1_fe_verify(a); #endif r[0] = (a->n[9] >> 14) & 0xff; r[1] = (a->n[9] >> 6) & 0xff; r[2] = ((a->n[9] & 0x3F) << 2) | ((a->n[8] >> 24) & 0x3); r[3] = (a->n[8] >> 16) & 0xff; r[4] = (a->n[8] >> 8) & 0xff; r[5] = a->n[8] & 0xff; r[6] = (a->n[7] >> 18) & 0xff; r[7] = (a->n[7] >> 10) & 0xff; r[8] = (a->n[7] >> 2) & 0xff; r[9] = ((a->n[7] & 0x3) << 6) | ((a->n[6] >> 20) & 0x3f); r[10] = (a->n[6] >> 12) & 0xff; r[11] = (a->n[6] >> 4) & 0xff; r[12] = ((a->n[6] & 0xf) << 4) | ((a->n[5] >> 22) & 0xf); r[13] = (a->n[5] >> 14) & 0xff; r[14] = (a->n[5] >> 6) & 0xff; r[15] = ((a->n[5] & 0x3f) << 2) | ((a->n[4] >> 24) & 0x3); r[16] = (a->n[4] >> 16) & 0xff; r[17] = (a->n[4] >> 8) & 0xff; r[18] = a->n[4] & 0xff; r[19] = (a->n[3] >> 18) & 0xff; r[20] = (a->n[3] >> 10) & 0xff; r[21] = (a->n[3] >> 2) & 0xff; r[22] = ((a->n[3] & 0x3) << 6) | ((a->n[2] >> 20) & 0x3f); r[23] = (a->n[2] >> 12) & 0xff; r[24] = (a->n[2] >> 4) & 0xff; r[25] = ((a->n[2] & 0xf) << 4) | ((a->n[1] >> 22) & 0xf); r[26] = (a->n[1] >> 14) & 0xff; r[27] = (a->n[1] >> 6) & 0xff; r[28] = ((a->n[1] & 0x3f) << 2) | ((a->n[0] >> 24) & 0x3); r[29] = (a->n[0] >> 16) & 0xff; r[30] = (a->n[0] >> 8) & 0xff; r[31] = a->n[0] & 0xff; } SECP256K1_INLINE static void secp256k1_fe_negate(secp256k1_fe *r, const secp256k1_fe *a, int m) { #ifdef VERIFY VERIFY_CHECK(a->magnitude <= m); secp256k1_fe_verify(a); #endif r->n[0] = 0x3FFFC2FUL * 2 * (m + 1) - a->n[0]; r->n[1] = 0x3FFFFBFUL * 2 * (m + 1) - a->n[1]; r->n[2] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[2]; r->n[3] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[3]; r->n[4] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[4]; r->n[5] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[5]; r->n[6] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[6]; r->n[7] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[7]; r->n[8] = 0x3FFFFFFUL * 2 * (m + 1) - a->n[8]; r->n[9] = 0x03FFFFFUL * 2 * (m + 1) - a->n[9]; #ifdef VERIFY r->magnitude = m + 1; r->normalized = 0; secp256k1_fe_verify(r); #endif } SECP256K1_INLINE static void secp256k1_fe_mul_int(secp256k1_fe *r, int a) { r->n[0] *= a; r->n[1] *= a; r->n[2] *= a; r->n[3] *= a; r->n[4] *= a; r->n[5] *= a; r->n[6] *= a; r->n[7] *= a; r->n[8] *= a; r->n[9] *= a; #ifdef VERIFY r->magnitude *= a; r->normalized = 0; secp256k1_fe_verify(r); #endif } SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe *r, const secp256k1_fe *a) { #ifdef VERIFY secp256k1_fe_verify(a); #endif r->n[0] += a->n[0]; r->n[1] += a->n[1]; r->n[2] += a->n[2]; r->n[3] += a->n[3]; r->n[4] += a->n[4]; r->n[5] += a->n[5]; r->n[6] += a->n[6]; r->n[7] += a->n[7]; r->n[8] += a->n[8]; r->n[9] += a->n[9]; #ifdef VERIFY r->magnitude += a->magnitude; r->normalized = 0; secp256k1_fe_verify(r); #endif } #if defined(USE_EXTERNAL_ASM) /* External assembler implementation */ void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t *a, const uint32_t * SECP256K1_RESTRICT b); void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t *a); #else #ifdef VERIFY #define VERIFY_BITS(x, n) VERIFY_CHECK(((x) >> (n)) == 0) #else #define VERIFY_BITS(x, n) do { } while(0) #endif 
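/* An illustrative aside on the reduction constants (added for exposition; the
 * helper below is a hypothetical sketch, not part of the library, and is never
 * compiled). R0 = 0x3D10 and R1 = 0x400 in mul_inner/sqr_inner encode
 * 2^256 mod p split across two 26-bit limb positions: since
 * p = 2^256 - 0x1000003D1, a value x of limb weight 2^260 (limb index 10)
 * satisfies x * 2^260 == x * ((R1 << 26) + R0) (mod p). The same identity
 * without the extra 4-bit shift gives the 0x3D1 / (x << 6) pair used by the
 * normalize functions above. */
#if 0 /* exposition-only self-check */
#include <assert.h>
#include <stdint.h>
static void fe_reduction_constants_check(void) {
    const uint64_t R0 = 0x3D10ULL, R1 = 0x400ULL;
    /* 0x1000003D1 is 2^256 mod p; the << 4 accounts for limb 10's weight 2^260. */
    assert((0x1000003D1ULL << 4) == ((R1 << 26) + R0));
    /* A carry at 2^256 folds in as 0x3D1 into limb 0 plus 0x40 into limb 1. */
    assert(0x1000003D1ULL == 0x3D1ULL + (0x40ULL << 26));
}
#endif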
SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint32_t *r, const uint32_t *a, const uint32_t * SECP256K1_RESTRICT b) { uint64_t c, d; uint64_t u0, u1, u2, u3, u4, u5, u6, u7, u8; uint32_t t9, t1, t0, t2, t3, t4, t5, t6, t7; const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL; VERIFY_BITS(a[0], 30); VERIFY_BITS(a[1], 30); VERIFY_BITS(a[2], 30); VERIFY_BITS(a[3], 30); VERIFY_BITS(a[4], 30); VERIFY_BITS(a[5], 30); VERIFY_BITS(a[6], 30); VERIFY_BITS(a[7], 30); VERIFY_BITS(a[8], 30); VERIFY_BITS(a[9], 26); VERIFY_BITS(b[0], 30); VERIFY_BITS(b[1], 30); VERIFY_BITS(b[2], 30); VERIFY_BITS(b[3], 30); VERIFY_BITS(b[4], 30); VERIFY_BITS(b[5], 30); VERIFY_BITS(b[6], 30); VERIFY_BITS(b[7], 30); VERIFY_BITS(b[8], 30); VERIFY_BITS(b[9], 26); /** [... a b c] is a shorthand for ... + a<<52 + b<<26 + c<<0 mod n. * for 0 <= x <= 9, px is a shorthand for sum(a[i]*b[x-i], i=0..x). * for 9 <= x <= 18, px is a shorthand for sum(a[i]*b[x-i], i=(x-9)..9) * Note that [x 0 0 0 0 0 0 0 0 0 0] = [x*R1 x*R0]. */ d = (uint64_t)a[0] * b[9] + (uint64_t)a[1] * b[8] + (uint64_t)a[2] * b[7] + (uint64_t)a[3] * b[6] + (uint64_t)a[4] * b[5] + (uint64_t)a[5] * b[4] + (uint64_t)a[6] * b[3] + (uint64_t)a[7] * b[2] + (uint64_t)a[8] * b[1] + (uint64_t)a[9] * b[0]; /* VERIFY_BITS(d, 64); */ /* [d 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] */ t9 = d & M; d >>= 26; VERIFY_BITS(t9, 26); VERIFY_BITS(d, 38); /* [d t9 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] */ c = (uint64_t)a[0] * b[0]; VERIFY_BITS(c, 60); /* [d t9 0 0 0 0 0 0 0 0 c] = [p9 0 0 0 0 0 0 0 0 p0] */ d += (uint64_t)a[1] * b[9] + (uint64_t)a[2] * b[8] + (uint64_t)a[3] * b[7] + (uint64_t)a[4] * b[6] + (uint64_t)a[5] * b[5] + (uint64_t)a[6] * b[4] + (uint64_t)a[7] * b[3] + (uint64_t)a[8] * b[2] + (uint64_t)a[9] * b[1]; VERIFY_BITS(d, 63); /* [d t9 0 0 0 0 0 0 0 0 c] = [p10 p9 0 0 0 0 0 0 0 0 p0] */ u0 = d & M; d >>= 26; c += u0 * R0; VERIFY_BITS(u0, 26); VERIFY_BITS(d, 37); VERIFY_BITS(c, 61); /* [d u0 t9 0 0 0 0 0 0 0 0 c-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] */ t0 = c & M; c >>= 26; c += u0 * R1; VERIFY_BITS(t0, 26); VERIFY_BITS(c, 37); /* [d u0 t9 0 0 0 0 0 0 0 c-u0*R1 t0-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] */ /* [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 0 p0] */ c += (uint64_t)a[0] * b[1] + (uint64_t)a[1] * b[0]; VERIFY_BITS(c, 62); /* [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 p1 p0] */ d += (uint64_t)a[2] * b[9] + (uint64_t)a[3] * b[8] + (uint64_t)a[4] * b[7] + (uint64_t)a[5] * b[6] + (uint64_t)a[6] * b[5] + (uint64_t)a[7] * b[4] + (uint64_t)a[8] * b[3] + (uint64_t)a[9] * b[2]; VERIFY_BITS(d, 63); /* [d 0 t9 0 0 0 0 0 0 0 c t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */ u1 = d & M; d >>= 26; c += u1 * R0; VERIFY_BITS(u1, 26); VERIFY_BITS(d, 37); VERIFY_BITS(c, 63); /* [d u1 0 t9 0 0 0 0 0 0 0 c-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */ t1 = c & M; c >>= 26; c += u1 * R1; VERIFY_BITS(t1, 26); VERIFY_BITS(c, 38); /* [d u1 0 t9 0 0 0 0 0 0 c-u1*R1 t1-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */ /* [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */ c += (uint64_t)a[0] * b[2] + (uint64_t)a[1] * b[1] + (uint64_t)a[2] * b[0]; VERIFY_BITS(c, 62); /* [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */ d += (uint64_t)a[3] * b[9] + (uint64_t)a[4] * b[8] + (uint64_t)a[5] * b[7] + (uint64_t)a[6] * b[6] + (uint64_t)a[7] * b[5] + (uint64_t)a[8] * b[4] + (uint64_t)a[9] * b[3]; VERIFY_BITS(d, 63); /* [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */ u2 = d & M; d >>= 26; c += u2 * R0; 
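/* This is the recurring reduction step of the column-wise multiply: u_k = d & M
 * peels the coefficient of 2^(26*(10+k)) off the high accumulator d, and since
 * 2^260 == (R1 << 26) + R0 (mod p), it is folded back as u_k*R0 into the
 * current column c and as u_k*R1 into the next column after the shift. */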
VERIFY_BITS(u2, 26); VERIFY_BITS(d, 37); VERIFY_BITS(c, 63); /* [d u2 0 0 t9 0 0 0 0 0 0 c-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */ t2 = c & M; c >>= 26; c += u2 * R1; VERIFY_BITS(t2, 26); VERIFY_BITS(c, 38); /* [d u2 0 0 t9 0 0 0 0 0 c-u2*R1 t2-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */ /* [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */ c += (uint64_t)a[0] * b[3] + (uint64_t)a[1] * b[2] + (uint64_t)a[2] * b[1] + (uint64_t)a[3] * b[0]; VERIFY_BITS(c, 63); /* [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */ d += (uint64_t)a[4] * b[9] + (uint64_t)a[5] * b[8] + (uint64_t)a[6] * b[7] + (uint64_t)a[7] * b[6] + (uint64_t)a[8] * b[5] + (uint64_t)a[9] * b[4]; VERIFY_BITS(d, 63); /* [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */ u3 = d & M; d >>= 26; c += u3 * R0; VERIFY_BITS(u3, 26); VERIFY_BITS(d, 37); /* VERIFY_BITS(c, 64); */ /* [d u3 0 0 0 t9 0 0 0 0 0 c-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */ t3 = c & M; c >>= 26; c += u3 * R1; VERIFY_BITS(t3, 26); VERIFY_BITS(c, 39); /* [d u3 0 0 0 t9 0 0 0 0 c-u3*R1 t3-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */ /* [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */ c += (uint64_t)a[0] * b[4] + (uint64_t)a[1] * b[3] + (uint64_t)a[2] * b[2] + (uint64_t)a[3] * b[1] + (uint64_t)a[4] * b[0]; VERIFY_BITS(c, 63); /* [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */ d += (uint64_t)a[5] * b[9] + (uint64_t)a[6] * b[8] + (uint64_t)a[7] * b[7] + (uint64_t)a[8] * b[6] + (uint64_t)a[9] * b[5]; VERIFY_BITS(d, 62); /* [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */ u4 = d & M; d >>= 26; c += u4 * R0; VERIFY_BITS(u4, 26); VERIFY_BITS(d, 36); /* VERIFY_BITS(c, 64); */ /* [d u4 0 0 0 0 t9 0 0 0 0 c-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */ t4 = c & M; c >>= 26; c += u4 * R1; VERIFY_BITS(t4, 26); VERIFY_BITS(c, 39); /* [d u4 0 0 0 0 t9 0 0 0 c-u4*R1 t4-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */ /* [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */ c += (uint64_t)a[0] * b[5] + (uint64_t)a[1] * b[4] + (uint64_t)a[2] * b[3] + (uint64_t)a[3] * b[2] + (uint64_t)a[4] * b[1] + (uint64_t)a[5] * b[0]; VERIFY_BITS(c, 63); /* [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */ d += (uint64_t)a[6] * b[9] + (uint64_t)a[7] * b[8] + (uint64_t)a[8] * b[7] + (uint64_t)a[9] * b[6]; VERIFY_BITS(d, 62); /* [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */ u5 = d & M; d >>= 26; c += u5 * R0; VERIFY_BITS(u5, 26); VERIFY_BITS(d, 36); /* VERIFY_BITS(c, 64); */ /* [d u5 0 0 0 0 0 t9 0 0 0 c-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */ t5 = c & M; c >>= 26; c += u5 * R1; VERIFY_BITS(t5, 26); VERIFY_BITS(c, 39); /* [d u5 0 0 0 0 0 t9 0 0 c-u5*R1 t5-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */ /* [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */ c += (uint64_t)a[0] * b[6] + (uint64_t)a[1] * b[5] + (uint64_t)a[2] * b[4] + (uint64_t)a[3] * b[3] + (uint64_t)a[4] * b[2] + (uint64_t)a[5] * b[1] + (uint64_t)a[6] * b[0]; VERIFY_BITS(c, 63); /* [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 
p1 p0] */ d += (uint64_t)a[7] * b[9] + (uint64_t)a[8] * b[8] + (uint64_t)a[9] * b[7]; VERIFY_BITS(d, 61); /* [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */ u6 = d & M; d >>= 26; c += u6 * R0; VERIFY_BITS(u6, 26); VERIFY_BITS(d, 35); /* VERIFY_BITS(c, 64); */ /* [d u6 0 0 0 0 0 0 t9 0 0 c-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */ t6 = c & M; c >>= 26; c += u6 * R1; VERIFY_BITS(t6, 26); VERIFY_BITS(c, 39); /* [d u6 0 0 0 0 0 0 t9 0 c-u6*R1 t6-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */ /* [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */ c += (uint64_t)a[0] * b[7] + (uint64_t)a[1] * b[6] + (uint64_t)a[2] * b[5] + (uint64_t)a[3] * b[4] + (uint64_t)a[4] * b[3] + (uint64_t)a[5] * b[2] + (uint64_t)a[6] * b[1] + (uint64_t)a[7] * b[0]; /* VERIFY_BITS(c, 64); */ VERIFY_CHECK(c <= 0x8000007C00000007ULL); /* [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */ d += (uint64_t)a[8] * b[9] + (uint64_t)a[9] * b[8]; VERIFY_BITS(d, 58); /* [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */ u7 = d & M; d >>= 26; c += u7 * R0; VERIFY_BITS(u7, 26); VERIFY_BITS(d, 32); /* VERIFY_BITS(c, 64); */ VERIFY_CHECK(c <= 0x800001703FFFC2F7ULL); /* [d u7 0 0 0 0 0 0 0 t9 0 c-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */ t7 = c & M; c >>= 26; c += u7 * R1; VERIFY_BITS(t7, 26); VERIFY_BITS(c, 38); /* [d u7 0 0 0 0 0 0 0 t9 c-u7*R1 t7-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */ /* [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */ c += (uint64_t)a[0] * b[8] + (uint64_t)a[1] * b[7] + (uint64_t)a[2] * b[6] + (uint64_t)a[3] * b[5] + (uint64_t)a[4] * b[4] + (uint64_t)a[5] * b[3] + (uint64_t)a[6] * b[2] + (uint64_t)a[7] * b[1] + (uint64_t)a[8] * b[0]; /* VERIFY_BITS(c, 64); */ VERIFY_CHECK(c <= 0x9000007B80000008ULL); /* [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ d += (uint64_t)a[9] * b[9]; VERIFY_BITS(d, 57); /* [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ u8 = d & M; d >>= 26; c += u8 * R0; VERIFY_BITS(u8, 26); VERIFY_BITS(d, 31); /* VERIFY_BITS(c, 64); */ VERIFY_CHECK(c <= 0x9000016FBFFFC2F8ULL); /* [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[3] = t3; VERIFY_BITS(r[3], 26); /* [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[4] = t4; VERIFY_BITS(r[4], 26); /* [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[5] = t5; VERIFY_BITS(r[5], 26); /* [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[6] = t6; VERIFY_BITS(r[6], 26); /* [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[7] = t7; VERIFY_BITS(r[7], 26); /* [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 
p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[8] = c & M; c >>= 26; c += u8 * R1; VERIFY_BITS(r[8], 26); VERIFY_BITS(c, 39); /* [d u8 0 0 0 0 0 0 0 0 t9+c-u8*R1 r8-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ /* [d 0 0 0 0 0 0 0 0 0 t9+c r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ c += d * R0 + t9; VERIFY_BITS(c, 45); /* [d 0 0 0 0 0 0 0 0 0 c-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[9] = c & (M >> 4); c >>= 22; c += d * (R1 << 4); VERIFY_BITS(r[9], 22); VERIFY_BITS(c, 46); /* [d 0 0 0 0 0 0 0 0 r9+((c-d*R1<<4)<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ /* [d 0 0 0 0 0 0 0 -d*R1 r9+(c<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ /* [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ d = c * (R0 >> 4) + t0; VERIFY_BITS(d, 56); /* [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 d-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[0] = d & M; d >>= 26; VERIFY_BITS(r[0], 26); VERIFY_BITS(d, 30); /* [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1+d r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ d += c * (R1 >> 4) + t1; VERIFY_BITS(d, 53); VERIFY_CHECK(d <= 0x10000003FFFFBFULL); /* [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 d-c*R1>>4 r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ /* [r9 r8 r7 r6 r5 r4 r3 t2 d r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[1] = d & M; d >>= 26; VERIFY_BITS(r[1], 26); VERIFY_BITS(d, 27); VERIFY_CHECK(d <= 0x4000000ULL); /* [r9 r8 r7 r6 r5 r4 r3 t2+d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ d += t2; VERIFY_BITS(d, 27); /* [r9 r8 r7 r6 r5 r4 r3 d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[2] = d; VERIFY_BITS(r[2], 27); /* [r9 r8 r7 r6 r5 r4 r3 r2 r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ } SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint32_t *r, const uint32_t *a) { uint64_t c, d; uint64_t u0, u1, u2, u3, u4, u5, u6, u7, u8; uint32_t t9, t0, t1, t2, t3, t4, t5, t6, t7; const uint32_t M = 0x3FFFFFFUL, R0 = 0x3D10UL, R1 = 0x400UL; VERIFY_BITS(a[0], 30); VERIFY_BITS(a[1], 30); VERIFY_BITS(a[2], 30); VERIFY_BITS(a[3], 30); VERIFY_BITS(a[4], 30); VERIFY_BITS(a[5], 30); VERIFY_BITS(a[6], 30); VERIFY_BITS(a[7], 30); VERIFY_BITS(a[8], 30); VERIFY_BITS(a[9], 26); /** [... a b c] is a shorthand for ... + a<<52 + b<<26 + c<<0 mod n. * px is a shorthand for sum(a[i]*a[x-i], i=0..x). * Note that [x 0 0 0 0 0 0 0 0 0 0] = [x*R1 x*R0]. 
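 * (Compared to mul_inner, each off-diagonal product a[i]*a[j] with i != j
 *  appears once with one factor pre-doubled, e.g. (a[0]*2) * a[9], instead of
 *  twice, roughly halving the number of 32x32->64 multiplications.)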
*/ d = (uint64_t)(a[0]*2) * a[9] + (uint64_t)(a[1]*2) * a[8] + (uint64_t)(a[2]*2) * a[7] + (uint64_t)(a[3]*2) * a[6] + (uint64_t)(a[4]*2) * a[5]; /* VERIFY_BITS(d, 64); */ /* [d 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] */ t9 = d & M; d >>= 26; VERIFY_BITS(t9, 26); VERIFY_BITS(d, 38); /* [d t9 0 0 0 0 0 0 0 0 0] = [p9 0 0 0 0 0 0 0 0 0] */ c = (uint64_t)a[0] * a[0]; VERIFY_BITS(c, 60); /* [d t9 0 0 0 0 0 0 0 0 c] = [p9 0 0 0 0 0 0 0 0 p0] */ d += (uint64_t)(a[1]*2) * a[9] + (uint64_t)(a[2]*2) * a[8] + (uint64_t)(a[3]*2) * a[7] + (uint64_t)(a[4]*2) * a[6] + (uint64_t)a[5] * a[5]; VERIFY_BITS(d, 63); /* [d t9 0 0 0 0 0 0 0 0 c] = [p10 p9 0 0 0 0 0 0 0 0 p0] */ u0 = d & M; d >>= 26; c += u0 * R0; VERIFY_BITS(u0, 26); VERIFY_BITS(d, 37); VERIFY_BITS(c, 61); /* [d u0 t9 0 0 0 0 0 0 0 0 c-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] */ t0 = c & M; c >>= 26; c += u0 * R1; VERIFY_BITS(t0, 26); VERIFY_BITS(c, 37); /* [d u0 t9 0 0 0 0 0 0 0 c-u0*R1 t0-u0*R0] = [p10 p9 0 0 0 0 0 0 0 0 p0] */ /* [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 0 p0] */ c += (uint64_t)(a[0]*2) * a[1]; VERIFY_BITS(c, 62); /* [d 0 t9 0 0 0 0 0 0 0 c t0] = [p10 p9 0 0 0 0 0 0 0 p1 p0] */ d += (uint64_t)(a[2]*2) * a[9] + (uint64_t)(a[3]*2) * a[8] + (uint64_t)(a[4]*2) * a[7] + (uint64_t)(a[5]*2) * a[6]; VERIFY_BITS(d, 63); /* [d 0 t9 0 0 0 0 0 0 0 c t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */ u1 = d & M; d >>= 26; c += u1 * R0; VERIFY_BITS(u1, 26); VERIFY_BITS(d, 37); VERIFY_BITS(c, 63); /* [d u1 0 t9 0 0 0 0 0 0 0 c-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */ t1 = c & M; c >>= 26; c += u1 * R1; VERIFY_BITS(t1, 26); VERIFY_BITS(c, 38); /* [d u1 0 t9 0 0 0 0 0 0 c-u1*R1 t1-u1*R0 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */ /* [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 0 p1 p0] */ c += (uint64_t)(a[0]*2) * a[2] + (uint64_t)a[1] * a[1]; VERIFY_BITS(c, 62); /* [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */ d += (uint64_t)(a[3]*2) * a[9] + (uint64_t)(a[4]*2) * a[8] + (uint64_t)(a[5]*2) * a[7] + (uint64_t)a[6] * a[6]; VERIFY_BITS(d, 63); /* [d 0 0 t9 0 0 0 0 0 0 c t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */ u2 = d & M; d >>= 26; c += u2 * R0; VERIFY_BITS(u2, 26); VERIFY_BITS(d, 37); VERIFY_BITS(c, 63); /* [d u2 0 0 t9 0 0 0 0 0 0 c-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */ t2 = c & M; c >>= 26; c += u2 * R1; VERIFY_BITS(t2, 26); VERIFY_BITS(c, 38); /* [d u2 0 0 t9 0 0 0 0 0 c-u2*R1 t2-u2*R0 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */ /* [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 0 p2 p1 p0] */ c += (uint64_t)(a[0]*2) * a[3] + (uint64_t)(a[1]*2) * a[2]; VERIFY_BITS(c, 63); /* [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */ d += (uint64_t)(a[4]*2) * a[9] + (uint64_t)(a[5]*2) * a[8] + (uint64_t)(a[6]*2) * a[7]; VERIFY_BITS(d, 63); /* [d 0 0 0 t9 0 0 0 0 0 c t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */ u3 = d & M; d >>= 26; c += u3 * R0; VERIFY_BITS(u3, 26); VERIFY_BITS(d, 37); /* VERIFY_BITS(c, 64); */ /* [d u3 0 0 0 t9 0 0 0 0 0 c-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */ t3 = c & M; c >>= 26; c += u3 * R1; VERIFY_BITS(t3, 26); VERIFY_BITS(c, 39); /* [d u3 0 0 0 t9 0 0 0 0 c-u3*R1 t3-u3*R0 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */ /* [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 p10 p9 0 0 0 0 0 p3 p2 p1 p0] */ c += (uint64_t)(a[0]*2) * a[4] + (uint64_t)(a[1]*2) * a[3] + (uint64_t)a[2] * a[2]; VERIFY_BITS(c, 63); /* [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p13 p12 p11 
p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */ d += (uint64_t)(a[5]*2) * a[9] + (uint64_t)(a[6]*2) * a[8] + (uint64_t)a[7] * a[7]; VERIFY_BITS(d, 62); /* [d 0 0 0 0 t9 0 0 0 0 c t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */ u4 = d & M; d >>= 26; c += u4 * R0; VERIFY_BITS(u4, 26); VERIFY_BITS(d, 36); /* VERIFY_BITS(c, 64); */ /* [d u4 0 0 0 0 t9 0 0 0 0 c-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */ t4 = c & M; c >>= 26; c += u4 * R1; VERIFY_BITS(t4, 26); VERIFY_BITS(c, 39); /* [d u4 0 0 0 0 t9 0 0 0 c-u4*R1 t4-u4*R0 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */ /* [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 0 p4 p3 p2 p1 p0] */ c += (uint64_t)(a[0]*2) * a[5] + (uint64_t)(a[1]*2) * a[4] + (uint64_t)(a[2]*2) * a[3]; VERIFY_BITS(c, 63); /* [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */ d += (uint64_t)(a[6]*2) * a[9] + (uint64_t)(a[7]*2) * a[8]; VERIFY_BITS(d, 62); /* [d 0 0 0 0 0 t9 0 0 0 c t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */ u5 = d & M; d >>= 26; c += u5 * R0; VERIFY_BITS(u5, 26); VERIFY_BITS(d, 36); /* VERIFY_BITS(c, 64); */ /* [d u5 0 0 0 0 0 t9 0 0 0 c-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */ t5 = c & M; c >>= 26; c += u5 * R1; VERIFY_BITS(t5, 26); VERIFY_BITS(c, 39); /* [d u5 0 0 0 0 0 t9 0 0 c-u5*R1 t5-u5*R0 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */ /* [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 0 p5 p4 p3 p2 p1 p0] */ c += (uint64_t)(a[0]*2) * a[6] + (uint64_t)(a[1]*2) * a[5] + (uint64_t)(a[2]*2) * a[4] + (uint64_t)a[3] * a[3]; VERIFY_BITS(c, 63); /* [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */ d += (uint64_t)(a[7]*2) * a[9] + (uint64_t)a[8] * a[8]; VERIFY_BITS(d, 61); /* [d 0 0 0 0 0 0 t9 0 0 c t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */ u6 = d & M; d >>= 26; c += u6 * R0; VERIFY_BITS(u6, 26); VERIFY_BITS(d, 35); /* VERIFY_BITS(c, 64); */ /* [d u6 0 0 0 0 0 0 t9 0 0 c-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */ t6 = c & M; c >>= 26; c += u6 * R1; VERIFY_BITS(t6, 26); VERIFY_BITS(c, 39); /* [d u6 0 0 0 0 0 0 t9 0 c-u6*R1 t6-u6*R0 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */ /* [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 0 p6 p5 p4 p3 p2 p1 p0] */ c += (uint64_t)(a[0]*2) * a[7] + (uint64_t)(a[1]*2) * a[6] + (uint64_t)(a[2]*2) * a[5] + (uint64_t)(a[3]*2) * a[4]; /* VERIFY_BITS(c, 64); */ VERIFY_CHECK(c <= 0x8000007C00000007ULL); /* [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */ d += (uint64_t)(a[8]*2) * a[9]; VERIFY_BITS(d, 58); /* [d 0 0 0 0 0 0 0 t9 0 c t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */ u7 = d & M; d >>= 26; c += u7 * R0; VERIFY_BITS(u7, 26); VERIFY_BITS(d, 32); /* VERIFY_BITS(c, 64); */ VERIFY_CHECK(c <= 0x800001703FFFC2F7ULL); /* [d u7 0 0 0 0 0 0 0 t9 0 c-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */ t7 = c & M; c >>= 26; c += u7 * R1; VERIFY_BITS(t7, 26); VERIFY_BITS(c, 38); /* [d u7 0 0 0 0 0 0 0 t9 c-u7*R1 t7-u7*R0 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */ /* [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 
t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 0 p7 p6 p5 p4 p3 p2 p1 p0] */ c += (uint64_t)(a[0]*2) * a[8] + (uint64_t)(a[1]*2) * a[7] + (uint64_t)(a[2]*2) * a[6] + (uint64_t)(a[3]*2) * a[5] + (uint64_t)a[4] * a[4]; /* VERIFY_BITS(c, 64); */ VERIFY_CHECK(c <= 0x9000007B80000008ULL); /* [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ d += (uint64_t)a[9] * a[9]; VERIFY_BITS(d, 57); /* [d 0 0 0 0 0 0 0 0 t9 c t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ u8 = d & M; d >>= 26; c += u8 * R0; VERIFY_BITS(u8, 26); VERIFY_BITS(d, 31); /* VERIFY_BITS(c, 64); */ VERIFY_CHECK(c <= 0x9000016FBFFFC2F8ULL); /* [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 t3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[3] = t3; VERIFY_BITS(r[3], 26); /* [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 t4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[4] = t4; VERIFY_BITS(r[4], 26); /* [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 t5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[5] = t5; VERIFY_BITS(r[5], 26); /* [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 t6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[6] = t6; VERIFY_BITS(r[6], 26); /* [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 t7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[7] = t7; VERIFY_BITS(r[7], 26); /* [d u8 0 0 0 0 0 0 0 0 t9 c-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[8] = c & M; c >>= 26; c += u8 * R1; VERIFY_BITS(r[8], 26); VERIFY_BITS(c, 39); /* [d u8 0 0 0 0 0 0 0 0 t9+c-u8*R1 r8-u8*R0 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ /* [d 0 0 0 0 0 0 0 0 0 t9+c r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ c += d * R0 + t9; VERIFY_BITS(c, 45); /* [d 0 0 0 0 0 0 0 0 0 c-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[9] = c & (M >> 4); c >>= 22; c += d * (R1 << 4); VERIFY_BITS(r[9], 22); VERIFY_BITS(c, 46); /* [d 0 0 0 0 0 0 0 0 r9+((c-d*R1<<4)<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ /* [d 0 0 0 0 0 0 0 -d*R1 r9+(c<<22)-d*R0 r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ /* [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 t0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ d = c * (R0 >> 4) + t0; VERIFY_BITS(d, 56); /* [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1 d-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[0] = d & M; d >>= 26; VERIFY_BITS(r[0], 26); VERIFY_BITS(d, 30); /* [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 t1+d r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ d += c * (R1 >> 4) + t1; VERIFY_BITS(d, 53); VERIFY_CHECK(d <= 0x10000003FFFFBFULL); /* [r9+(c<<22) r8 r7 r6 r5 r4 r3 t2 d-c*R1>>4 r0-c*R0>>4] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ /* [r9 r8 r7 r6 r5 r4 r3 t2 d r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[1] = d & M; d >>= 26; VERIFY_BITS(r[1], 26); VERIFY_BITS(d, 27); VERIFY_CHECK(d <= 0x4000000ULL); /* [r9 r8 r7 r6 r5 r4 r3 t2+d r1 r0] = 
[p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ d += t2; VERIFY_BITS(d, 27); /* [r9 r8 r7 r6 r5 r4 r3 d r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ r[2] = d; VERIFY_BITS(r[2], 27); /* [r9 r8 r7 r6 r5 r4 r3 r2 r1 r0] = [p18 p17 p16 p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0] */ } #endif static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp256k1_fe * SECP256K1_RESTRICT b) { #ifdef VERIFY VERIFY_CHECK(a->magnitude <= 8); VERIFY_CHECK(b->magnitude <= 8); secp256k1_fe_verify(a); secp256k1_fe_verify(b); VERIFY_CHECK(r != b); VERIFY_CHECK(a != b); #endif secp256k1_fe_mul_inner(r->n, a->n, b->n); #ifdef VERIFY r->magnitude = 1; r->normalized = 0; secp256k1_fe_verify(r); #endif } static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a) { #ifdef VERIFY VERIFY_CHECK(a->magnitude <= 8); secp256k1_fe_verify(a); #endif secp256k1_fe_sqr_inner(r->n, a->n); #ifdef VERIFY r->magnitude = 1; r->normalized = 0; secp256k1_fe_verify(r); #endif } static SECP256K1_INLINE void secp256k1_fe_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag) { uint32_t mask0, mask1; VG_CHECK_VERIFY(r->n, sizeof(r->n)); mask0 = flag + ~((uint32_t)0); mask1 = ~mask0; r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1); r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1); r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1); r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1); r->n[4] = (r->n[4] & mask0) | (a->n[4] & mask1); r->n[5] = (r->n[5] & mask0) | (a->n[5] & mask1); r->n[6] = (r->n[6] & mask0) | (a->n[6] & mask1); r->n[7] = (r->n[7] & mask0) | (a->n[7] & mask1); r->n[8] = (r->n[8] & mask0) | (a->n[8] & mask1); r->n[9] = (r->n[9] & mask0) | (a->n[9] & mask1); #ifdef VERIFY if (flag) { r->magnitude = a->magnitude; r->normalized = a->normalized; } #endif } static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r, const secp256k1_fe_storage *a, int flag) { uint32_t mask0, mask1; VG_CHECK_VERIFY(r->n, sizeof(r->n)); mask0 = flag + ~((uint32_t)0); mask1 = ~mask0; r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1); r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1); r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1); r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1); r->n[4] = (r->n[4] & mask0) | (a->n[4] & mask1); r->n[5] = (r->n[5] & mask0) | (a->n[5] & mask1); r->n[6] = (r->n[6] & mask0) | (a->n[6] & mask1); r->n[7] = (r->n[7] & mask0) | (a->n[7] & mask1); } static void secp256k1_fe_to_storage(secp256k1_fe_storage *r, const secp256k1_fe *a) { #ifdef VERIFY VERIFY_CHECK(a->normalized); #endif r->n[0] = a->n[0] | a->n[1] << 26; r->n[1] = a->n[1] >> 6 | a->n[2] << 20; r->n[2] = a->n[2] >> 12 | a->n[3] << 14; r->n[3] = a->n[3] >> 18 | a->n[4] << 8; r->n[4] = a->n[4] >> 24 | a->n[5] << 2 | a->n[6] << 28; r->n[5] = a->n[6] >> 4 | a->n[7] << 22; r->n[6] = a->n[7] >> 10 | a->n[8] << 16; r->n[7] = a->n[8] >> 16 | a->n[9] << 10; } static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const secp256k1_fe_storage *a) { r->n[0] = a->n[0] & 0x3FFFFFFUL; r->n[1] = a->n[0] >> 26 | ((a->n[1] << 6) & 0x3FFFFFFUL); r->n[2] = a->n[1] >> 20 | ((a->n[2] << 12) & 0x3FFFFFFUL); r->n[3] = a->n[2] >> 14 | ((a->n[3] << 18) & 0x3FFFFFFUL); r->n[4] = a->n[3] >> 8 | ((a->n[4] << 24) & 0x3FFFFFFUL); r->n[5] = (a->n[4] >> 2) & 0x3FFFFFFUL; r->n[6] = a->n[4] >> 28 | ((a->n[5] << 4) & 0x3FFFFFFUL); r->n[7] = a->n[5] >> 22 | ((a->n[6] << 10) & 0x3FFFFFFUL); r->n[8] = a->n[6] >> 16 | ((a->n[7] << 16) & 0x3FFFFFFUL); r->n[9] = a->n[7] >> 10; #ifdef 
VERIFY r->magnitude = 1; r->normalized = 1; #endif } +static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { + secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; + int j; + + /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in + * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: + * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] + */ + + secp256k1_fe_sqr(&x2, a); + secp256k1_fe_mul(&x2, &x2, a); + + secp256k1_fe_sqr(&x3, &x2); + secp256k1_fe_mul(&x3, &x3, a); + + x6 = x3; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x6, &x6); + } + secp256k1_fe_mul(&x6, &x6, &x3); + + x9 = x6; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x9, &x9); + } + secp256k1_fe_mul(&x9, &x9, &x3); + + x11 = x9; + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&x11, &x11); + } + secp256k1_fe_mul(&x11, &x11, &x2); + + x22 = x11; + for (j=0; j<11; j++) { + secp256k1_fe_sqr(&x22, &x22); + } + secp256k1_fe_mul(&x22, &x22, &x11); + + x44 = x22; + for (j=0; j<22; j++) { + secp256k1_fe_sqr(&x44, &x44); + } + secp256k1_fe_mul(&x44, &x44, &x22); + + x88 = x44; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x88, &x88); + } + secp256k1_fe_mul(&x88, &x88, &x44); + + x176 = x88; + for (j=0; j<88; j++) { + secp256k1_fe_sqr(&x176, &x176); + } + secp256k1_fe_mul(&x176, &x176, &x88); + + x220 = x176; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x220, &x220); + } + secp256k1_fe_mul(&x220, &x220, &x44); + + x223 = x220; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x223, &x223); + } + secp256k1_fe_mul(&x223, &x223, &x3); + + /* The final result is then assembled using a sliding window over the blocks. */ + + t1 = x223; + for (j=0; j<23; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x22); + for (j=0; j<5; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, a); + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x2); + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(r, a, &t1); +} + +static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { +#if defined(USE_FIELD_INV_BUILTIN) + secp256k1_fe_inv(r, a); +#elif defined(USE_FIELD_INV_NUM) + secp256k1_num n, m; + static const secp256k1_fe negone = SECP256K1_FE_CONST( + 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, + 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL, 0xFFFFFC2EUL + ); + /* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */ + static const unsigned char prime[32] = { + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F + }; + unsigned char b[32]; + int res; + secp256k1_fe c = *a; + secp256k1_fe_normalize_var(&c); + secp256k1_fe_get_b32(b, &c); + secp256k1_num_set_bin(&n, b, 32); + secp256k1_num_set_bin(&m, prime, 32); + secp256k1_num_mod_inverse(&n, &n, &m); + secp256k1_num_get_bin(b, 32, &n); + res = secp256k1_fe_set_b32(r, b); + (void)res; + VERIFY_CHECK(res); + /* Verify the result is the (unique) valid inverse using non-GMP code. 
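+ * (Note: the check below uses CHECK, which stays active outside VERIFY
+ * builds, presumably because the inverse comes from the external GMP-based
+ * code path and is cheap to re-verify with the builtin field arithmetic.)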
*/ + secp256k1_fe_mul(&c, &c, r); + secp256k1_fe_add(&c, &negone); + CHECK(secp256k1_fe_normalizes_to_zero_var(&c)); +#else +#error "Please select field inverse implementation" +#endif +} + #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/secp256k1/src/field_5x52_impl.h b/src/secp256k1/src/field_5x52_impl.h index 3465ea324..b56456749 100644 --- a/src/secp256k1/src/field_5x52_impl.h +++ b/src/secp256k1/src/field_5x52_impl.h @@ -1,501 +1,628 @@ /*********************************************************************** * Copyright (c) 2013, 2014 Pieter Wuille * * Distributed under the MIT software license, see the accompanying * * file COPYING or https://www.opensource.org/licenses/mit-license.php.* ***********************************************************************/ #ifndef SECP256K1_FIELD_REPR_IMPL_H #define SECP256K1_FIELD_REPR_IMPL_H #if defined HAVE_CONFIG_H #include "libsecp256k1-config.h" #endif #include "util.h" #include "field.h" #if defined(USE_ASM_X86_64) #include "field_5x52_asm_impl.h" #else #include "field_5x52_int128_impl.h" #endif /** Implements arithmetic modulo FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFF FFFFFFFE FFFFFC2F, * represented as 5 uint64_t's in base 2^52. The values are allowed to contain >52 bits each. In particular, * each FieldElem has a 'magnitude' associated with it. Internally, a magnitude M means each element * is at most M*(2^53-1), except the most significant one, which is limited to M*(2^49-1). All operations * accept any input with magnitude at most M, and have different rules for propagating magnitude to their * output. */ #ifdef VERIFY static void secp256k1_fe_verify(const secp256k1_fe *a) { const uint64_t *d = a->n; int m = a->normalized ? 1 : 2 * a->magnitude, r = 1; /* secp256k1 'p' value defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */ r &= (d[0] <= 0xFFFFFFFFFFFFFULL * m); r &= (d[1] <= 0xFFFFFFFFFFFFFULL * m); r &= (d[2] <= 0xFFFFFFFFFFFFFULL * m); r &= (d[3] <= 0xFFFFFFFFFFFFFULL * m); r &= (d[4] <= 0x0FFFFFFFFFFFFULL * m); r &= (a->magnitude >= 0); r &= (a->magnitude <= 2048); if (a->normalized) { r &= (a->magnitude <= 1); if (r && (d[4] == 0x0FFFFFFFFFFFFULL) && ((d[3] & d[2] & d[1]) == 0xFFFFFFFFFFFFFULL)) { r &= (d[0] < 0xFFFFEFFFFFC2FULL); } } VERIFY_CHECK(r == 1); } #endif static void secp256k1_fe_normalize(secp256k1_fe *r) { uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4]; /* Reduce t4 at the start so there will be at most a single carry from the first pass */ uint64_t m; uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL; /* The first pass ensures the magnitude is 1, ... */ t0 += x * 0x1000003D1ULL; t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; m = t1; t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; m &= t2; t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; m &= t3; /* ... except for a possible carry at bit 48 of t4 (i.e.
bit 256 of the field element) */ VERIFY_CHECK(t4 >> 49 == 0); /* At most a single final reduction is needed; check if the value is >= the field characteristic */ x = (t4 >> 48) | ((t4 == 0x0FFFFFFFFFFFFULL) & (m == 0xFFFFFFFFFFFFFULL) & (t0 >= 0xFFFFEFFFFFC2FULL)); /* Apply the final reduction (for constant-time behaviour, we do it always) */ t0 += x * 0x1000003D1ULL; t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; /* If t4 didn't carry to bit 48 already, then it should have after any final reduction */ VERIFY_CHECK(t4 >> 48 == x); /* Mask off the possible multiple of 2^256 from the final reduction */ t4 &= 0x0FFFFFFFFFFFFULL; r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4; #ifdef VERIFY r->magnitude = 1; r->normalized = 1; secp256k1_fe_verify(r); #endif } static void secp256k1_fe_normalize_weak(secp256k1_fe *r) { uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4]; /* Reduce t4 at the start so there will be at most a single carry from the first pass */ uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL; /* The first pass ensures the magnitude is 1, ... */ t0 += x * 0x1000003D1ULL; t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; /* ... except for a possible carry at bit 48 of t4 (i.e. bit 256 of the field element) */ VERIFY_CHECK(t4 >> 49 == 0); r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4; #ifdef VERIFY r->magnitude = 1; secp256k1_fe_verify(r); #endif } static void secp256k1_fe_normalize_var(secp256k1_fe *r) { uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4]; /* Reduce t4 at the start so there will be at most a single carry from the first pass */ uint64_t m; uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL; /* The first pass ensures the magnitude is 1, ... */ t0 += x * 0x1000003D1ULL; t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; m = t1; t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; m &= t2; t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; m &= t3; /* ... except for a possible carry at bit 48 of t4 (i.e. 
bit 256 of the field element) */ VERIFY_CHECK(t4 >> 49 == 0); /* At most a single final reduction is needed; check if the value is >= the field characteristic */ x = (t4 >> 48) | ((t4 == 0x0FFFFFFFFFFFFULL) & (m == 0xFFFFFFFFFFFFFULL) & (t0 >= 0xFFFFEFFFFFC2FULL)); if (x) { t0 += 0x1000003D1ULL; t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; /* If t4 didn't carry to bit 48 already, then it should have after any final reduction */ VERIFY_CHECK(t4 >> 48 == x); /* Mask off the possible multiple of 2^256 from the final reduction */ t4 &= 0x0FFFFFFFFFFFFULL; } r->n[0] = t0; r->n[1] = t1; r->n[2] = t2; r->n[3] = t3; r->n[4] = t4; #ifdef VERIFY r->magnitude = 1; r->normalized = 1; secp256k1_fe_verify(r); #endif } static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) { uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4]; /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */ uint64_t z0, z1; /* Reduce t4 at the start so there will be at most a single carry from the first pass */ uint64_t x = t4 >> 48; t4 &= 0x0FFFFFFFFFFFFULL; /* The first pass ensures the magnitude is 1, ... */ t0 += x * 0x1000003D1ULL; t1 += (t0 >> 52); t0 &= 0xFFFFFFFFFFFFFULL; z0 = t0; z1 = t0 ^ 0x1000003D0ULL; t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; z0 |= t1; z1 &= t1; t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; z0 |= t2; z1 &= t2; t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; z0 |= t3; z1 &= t3; z0 |= t4; z1 &= t4 ^ 0xF000000000000ULL; /* ... except for a possible carry at bit 48 of t4 (i.e. bit 256 of the field element) */ VERIFY_CHECK(t4 >> 49 == 0); return (z0 == 0) | (z1 == 0xFFFFFFFFFFFFFULL); } static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r) { uint64_t t0, t1, t2, t3, t4; uint64_t z0, z1; uint64_t x; t0 = r->n[0]; t4 = r->n[4]; /* Reduce t4 at the start so there will be at most a single carry from the first pass */ x = t4 >> 48; /* The first pass ensures the magnitude is 1, ... */ t0 += x * 0x1000003D1ULL; /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */ z0 = t0 & 0xFFFFFFFFFFFFFULL; z1 = z0 ^ 0x1000003D0ULL; /* Fast return path should catch the majority of cases */ if ((z0 != 0ULL) & (z1 != 0xFFFFFFFFFFFFFULL)) { return 0; } t1 = r->n[1]; t2 = r->n[2]; t3 = r->n[3]; t4 &= 0x0FFFFFFFFFFFFULL; t1 += (t0 >> 52); t2 += (t1 >> 52); t1 &= 0xFFFFFFFFFFFFFULL; z0 |= t1; z1 &= t1; t3 += (t2 >> 52); t2 &= 0xFFFFFFFFFFFFFULL; z0 |= t2; z1 &= t2; t4 += (t3 >> 52); t3 &= 0xFFFFFFFFFFFFFULL; z0 |= t3; z1 &= t3; z0 |= t4; z1 &= t4 ^ 0xF000000000000ULL; /* ... except for a possible carry at bit 48 of t4 (i.e. 
bit 256 of the field element) */ VERIFY_CHECK(t4 >> 49 == 0); return (z0 == 0) | (z1 == 0xFFFFFFFFFFFFFULL); } SECP256K1_INLINE static void secp256k1_fe_set_int(secp256k1_fe *r, int a) { r->n[0] = a; r->n[1] = r->n[2] = r->n[3] = r->n[4] = 0; #ifdef VERIFY r->magnitude = 1; r->normalized = 1; secp256k1_fe_verify(r); #endif } SECP256K1_INLINE static int secp256k1_fe_is_zero(const secp256k1_fe *a) { const uint64_t *t = a->n; #ifdef VERIFY VERIFY_CHECK(a->normalized); secp256k1_fe_verify(a); #endif return (t[0] | t[1] | t[2] | t[3] | t[4]) == 0; } SECP256K1_INLINE static int secp256k1_fe_is_odd(const secp256k1_fe *a) { #ifdef VERIFY VERIFY_CHECK(a->normalized); secp256k1_fe_verify(a); #endif return a->n[0] & 1; } SECP256K1_INLINE static void secp256k1_fe_clear(secp256k1_fe *a) { int i; #ifdef VERIFY a->magnitude = 0; a->normalized = 1; #endif for (i=0; i<5; i++) { a->n[i] = 0; } } static int secp256k1_fe_cmp_var(const secp256k1_fe *a, const secp256k1_fe *b) { int i; #ifdef VERIFY VERIFY_CHECK(a->normalized); VERIFY_CHECK(b->normalized); secp256k1_fe_verify(a); secp256k1_fe_verify(b); #endif for (i = 4; i >= 0; i--) { if (a->n[i] > b->n[i]) { return 1; } if (a->n[i] < b->n[i]) { return -1; } } return 0; } static int secp256k1_fe_set_b32(secp256k1_fe *r, const unsigned char *a) { int ret; r->n[0] = (uint64_t)a[31] | ((uint64_t)a[30] << 8) | ((uint64_t)a[29] << 16) | ((uint64_t)a[28] << 24) | ((uint64_t)a[27] << 32) | ((uint64_t)a[26] << 40) | ((uint64_t)(a[25] & 0xF) << 48); r->n[1] = (uint64_t)((a[25] >> 4) & 0xF) | ((uint64_t)a[24] << 4) | ((uint64_t)a[23] << 12) | ((uint64_t)a[22] << 20) | ((uint64_t)a[21] << 28) | ((uint64_t)a[20] << 36) | ((uint64_t)a[19] << 44); r->n[2] = (uint64_t)a[18] | ((uint64_t)a[17] << 8) | ((uint64_t)a[16] << 16) | ((uint64_t)a[15] << 24) | ((uint64_t)a[14] << 32) | ((uint64_t)a[13] << 40) | ((uint64_t)(a[12] & 0xF) << 48); r->n[3] = (uint64_t)((a[12] >> 4) & 0xF) | ((uint64_t)a[11] << 4) | ((uint64_t)a[10] << 12) | ((uint64_t)a[9] << 20) | ((uint64_t)a[8] << 28) | ((uint64_t)a[7] << 36) | ((uint64_t)a[6] << 44); r->n[4] = (uint64_t)a[5] | ((uint64_t)a[4] << 8) | ((uint64_t)a[3] << 16) | ((uint64_t)a[2] << 24) | ((uint64_t)a[1] << 32) | ((uint64_t)a[0] << 40); ret = !((r->n[4] == 0x0FFFFFFFFFFFFULL) & ((r->n[3] & r->n[2] & r->n[1]) == 0xFFFFFFFFFFFFFULL) & (r->n[0] >= 0xFFFFEFFFFFC2FULL)); #ifdef VERIFY r->magnitude = 1; if (ret) { r->normalized = 1; secp256k1_fe_verify(r); } else { r->normalized = 0; } #endif return ret; } /** Convert a field element to a 32-byte big endian value. 
Requires the input to be normalized */ static void secp256k1_fe_get_b32(unsigned char *r, const secp256k1_fe *a) { #ifdef VERIFY VERIFY_CHECK(a->normalized); secp256k1_fe_verify(a); #endif r[0] = (a->n[4] >> 40) & 0xFF; r[1] = (a->n[4] >> 32) & 0xFF; r[2] = (a->n[4] >> 24) & 0xFF; r[3] = (a->n[4] >> 16) & 0xFF; r[4] = (a->n[4] >> 8) & 0xFF; r[5] = a->n[4] & 0xFF; r[6] = (a->n[3] >> 44) & 0xFF; r[7] = (a->n[3] >> 36) & 0xFF; r[8] = (a->n[3] >> 28) & 0xFF; r[9] = (a->n[3] >> 20) & 0xFF; r[10] = (a->n[3] >> 12) & 0xFF; r[11] = (a->n[3] >> 4) & 0xFF; r[12] = ((a->n[2] >> 48) & 0xF) | ((a->n[3] & 0xF) << 4); r[13] = (a->n[2] >> 40) & 0xFF; r[14] = (a->n[2] >> 32) & 0xFF; r[15] = (a->n[2] >> 24) & 0xFF; r[16] = (a->n[2] >> 16) & 0xFF; r[17] = (a->n[2] >> 8) & 0xFF; r[18] = a->n[2] & 0xFF; r[19] = (a->n[1] >> 44) & 0xFF; r[20] = (a->n[1] >> 36) & 0xFF; r[21] = (a->n[1] >> 28) & 0xFF; r[22] = (a->n[1] >> 20) & 0xFF; r[23] = (a->n[1] >> 12) & 0xFF; r[24] = (a->n[1] >> 4) & 0xFF; r[25] = ((a->n[0] >> 48) & 0xF) | ((a->n[1] & 0xF) << 4); r[26] = (a->n[0] >> 40) & 0xFF; r[27] = (a->n[0] >> 32) & 0xFF; r[28] = (a->n[0] >> 24) & 0xFF; r[29] = (a->n[0] >> 16) & 0xFF; r[30] = (a->n[0] >> 8) & 0xFF; r[31] = a->n[0] & 0xFF; } SECP256K1_INLINE static void secp256k1_fe_negate(secp256k1_fe *r, const secp256k1_fe *a, int m) { #ifdef VERIFY VERIFY_CHECK(a->magnitude <= m); secp256k1_fe_verify(a); #endif r->n[0] = 0xFFFFEFFFFFC2FULL * 2 * (m + 1) - a->n[0]; r->n[1] = 0xFFFFFFFFFFFFFULL * 2 * (m + 1) - a->n[1]; r->n[2] = 0xFFFFFFFFFFFFFULL * 2 * (m + 1) - a->n[2]; r->n[3] = 0xFFFFFFFFFFFFFULL * 2 * (m + 1) - a->n[3]; r->n[4] = 0x0FFFFFFFFFFFFULL * 2 * (m + 1) - a->n[4]; #ifdef VERIFY r->magnitude = m + 1; r->normalized = 0; secp256k1_fe_verify(r); #endif } SECP256K1_INLINE static void secp256k1_fe_mul_int(secp256k1_fe *r, int a) { r->n[0] *= a; r->n[1] *= a; r->n[2] *= a; r->n[3] *= a; r->n[4] *= a; #ifdef VERIFY r->magnitude *= a; r->normalized = 0; secp256k1_fe_verify(r); #endif } SECP256K1_INLINE static void secp256k1_fe_add(secp256k1_fe *r, const secp256k1_fe *a) { #ifdef VERIFY secp256k1_fe_verify(a); #endif r->n[0] += a->n[0]; r->n[1] += a->n[1]; r->n[2] += a->n[2]; r->n[3] += a->n[3]; r->n[4] += a->n[4]; #ifdef VERIFY r->magnitude += a->magnitude; r->normalized = 0; secp256k1_fe_verify(r); #endif } static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp256k1_fe * SECP256K1_RESTRICT b) { #ifdef VERIFY VERIFY_CHECK(a->magnitude <= 8); VERIFY_CHECK(b->magnitude <= 8); secp256k1_fe_verify(a); secp256k1_fe_verify(b); VERIFY_CHECK(r != b); VERIFY_CHECK(a != b); #endif secp256k1_fe_mul_inner(r->n, a->n, b->n); #ifdef VERIFY r->magnitude = 1; r->normalized = 0; secp256k1_fe_verify(r); #endif } static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a) { #ifdef VERIFY VERIFY_CHECK(a->magnitude <= 8); secp256k1_fe_verify(a); #endif secp256k1_fe_sqr_inner(r->n, a->n); #ifdef VERIFY r->magnitude = 1; r->normalized = 0; secp256k1_fe_verify(r); #endif } static SECP256K1_INLINE void secp256k1_fe_cmov(secp256k1_fe *r, const secp256k1_fe *a, int flag) { uint64_t mask0, mask1; VG_CHECK_VERIFY(r->n, sizeof(r->n)); mask0 = flag + ~((uint64_t)0); mask1 = ~mask0; r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1); r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1); r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1); r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1); r->n[4] = (r->n[4] & mask0) | (a->n[4] & mask1); #ifdef VERIFY if (flag) { r->magnitude = a->magnitude; r->normalized = a->normalized; } 
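    /* mask0 = flag + ~0 = flag - 1 is all-ones when flag == 0 and all-zeros
     * when flag == 1 (flag must be 0 or 1), so the limbs above are selected
     * without a data-dependent branch; only this VERIFY-only bookkeeping
     * branches on flag. */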
#endif } static SECP256K1_INLINE void secp256k1_fe_storage_cmov(secp256k1_fe_storage *r, const secp256k1_fe_storage *a, int flag) { uint64_t mask0, mask1; VG_CHECK_VERIFY(r->n, sizeof(r->n)); mask0 = flag + ~((uint64_t)0); mask1 = ~mask0; r->n[0] = (r->n[0] & mask0) | (a->n[0] & mask1); r->n[1] = (r->n[1] & mask0) | (a->n[1] & mask1); r->n[2] = (r->n[2] & mask0) | (a->n[2] & mask1); r->n[3] = (r->n[3] & mask0) | (a->n[3] & mask1); } static void secp256k1_fe_to_storage(secp256k1_fe_storage *r, const secp256k1_fe *a) { #ifdef VERIFY VERIFY_CHECK(a->normalized); #endif r->n[0] = a->n[0] | a->n[1] << 52; r->n[1] = a->n[1] >> 12 | a->n[2] << 40; r->n[2] = a->n[2] >> 24 | a->n[3] << 28; r->n[3] = a->n[3] >> 36 | a->n[4] << 16; } static SECP256K1_INLINE void secp256k1_fe_from_storage(secp256k1_fe *r, const secp256k1_fe_storage *a) { r->n[0] = a->n[0] & 0xFFFFFFFFFFFFFULL; r->n[1] = a->n[0] >> 52 | ((a->n[1] << 12) & 0xFFFFFFFFFFFFFULL); r->n[2] = a->n[1] >> 40 | ((a->n[2] << 24) & 0xFFFFFFFFFFFFFULL); r->n[3] = a->n[2] >> 28 | ((a->n[3] << 36) & 0xFFFFFFFFFFFFFULL); r->n[4] = a->n[3] >> 16; #ifdef VERIFY r->magnitude = 1; r->normalized = 1; #endif } +static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { + secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; + int j; + + /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in + * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: + * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] + */ + + secp256k1_fe_sqr(&x2, a); + secp256k1_fe_mul(&x2, &x2, a); + + secp256k1_fe_sqr(&x3, &x2); + secp256k1_fe_mul(&x3, &x3, a); + + x6 = x3; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x6, &x6); + } + secp256k1_fe_mul(&x6, &x6, &x3); + + x9 = x6; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x9, &x9); + } + secp256k1_fe_mul(&x9, &x9, &x3); + + x11 = x9; + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&x11, &x11); + } + secp256k1_fe_mul(&x11, &x11, &x2); + + x22 = x11; + for (j=0; j<11; j++) { + secp256k1_fe_sqr(&x22, &x22); + } + secp256k1_fe_mul(&x22, &x22, &x11); + + x44 = x22; + for (j=0; j<22; j++) { + secp256k1_fe_sqr(&x44, &x44); + } + secp256k1_fe_mul(&x44, &x44, &x22); + + x88 = x44; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x88, &x88); + } + secp256k1_fe_mul(&x88, &x88, &x44); + + x176 = x88; + for (j=0; j<88; j++) { + secp256k1_fe_sqr(&x176, &x176); + } + secp256k1_fe_mul(&x176, &x176, &x88); + + x220 = x176; + for (j=0; j<44; j++) { + secp256k1_fe_sqr(&x220, &x220); + } + secp256k1_fe_mul(&x220, &x220, &x44); + + x223 = x220; + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&x223, &x223); + } + secp256k1_fe_mul(&x223, &x223, &x3); + + /* The final result is then assembled using a sliding window over the blocks. 
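 *
 * Overall this raises a to the power p - 2: by Fermat's little theorem
 * a^(p-1) == 1 for nonzero a, so a^(p-2) is the inverse. As a reference
 * point (names hypothetical, exp_p_minus_2 standing for the 32 big-endian
 * bytes of p - 2), a plain square-and-multiply loop computes the same
 * thing far less efficiently:
 *
 *     static void fe_inv_naive(secp256k1_fe *r, const secp256k1_fe *a,
 *                              const unsigned char exp_p_minus_2[32]) {
 *         secp256k1_fe t = *a;   // bit 255 of p - 2 is 1
 *         int i;
 *         for (i = 1; i < 256; i++) {
 *             int bit = (exp_p_minus_2[i >> 3] >> (7 - (i & 7))) & 1;
 *             secp256k1_fe_sqr(&t, &t);
 *             if (bit) {
 *                 secp256k1_fe_mul(&t, &t, a);
 *             }
 *         }
 *         *r = t;
 *     }
 *
 * The chain above gets by with 15 multiplications on top of its 255
 * squarings, against roughly 128 expected multiplications for this loop.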
*/ + + t1 = x223; + for (j=0; j<23; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x22); + for (j=0; j<5; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, a); + for (j=0; j<3; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(&t1, &t1, &x2); + for (j=0; j<2; j++) { + secp256k1_fe_sqr(&t1, &t1); + } + secp256k1_fe_mul(r, a, &t1); +} + +static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { +#if defined(USE_FIELD_INV_BUILTIN) + secp256k1_fe_inv(r, a); +#elif defined(USE_FIELD_INV_NUM) + secp256k1_num n, m; + static const secp256k1_fe negone = SECP256K1_FE_CONST( + 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, + 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL, 0xFFFFFC2EUL + ); + /* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */ + static const unsigned char prime[32] = { + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, + 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F + }; + unsigned char b[32]; + int res; + secp256k1_fe c = *a; + secp256k1_fe_normalize_var(&c); + secp256k1_fe_get_b32(b, &c); + secp256k1_num_set_bin(&n, b, 32); + secp256k1_num_set_bin(&m, prime, 32); + secp256k1_num_mod_inverse(&n, &n, &m); + secp256k1_num_get_bin(b, 32, &n); + res = secp256k1_fe_set_b32(r, b); + (void)res; + VERIFY_CHECK(res); + /* Verify the result is the (unique) valid inverse using non-GMP code. */ + secp256k1_fe_mul(&c, &c, r); + secp256k1_fe_add(&c, &negone); + CHECK(secp256k1_fe_normalizes_to_zero_var(&c)); +#else +#error "Please select field inverse implementation" +#endif +} + #endif /* SECP256K1_FIELD_REPR_IMPL_H */ diff --git a/src/secp256k1/src/field_impl.h b/src/secp256k1/src/field_impl.h index f0096f631..7b75e9860 100644 --- a/src/secp256k1/src/field_impl.h +++ b/src/secp256k1/src/field_impl.h @@ -1,293 +1,166 @@ /*********************************************************************** * Copyright (c) 2013, 2014 Pieter Wuille * * Distributed under the MIT software license, see the accompanying * * file COPYING or https://www.opensource.org/licenses/mit-license.php.* ***********************************************************************/ #ifndef SECP256K1_FIELD_IMPL_H #define SECP256K1_FIELD_IMPL_H #if defined HAVE_CONFIG_H #include "libsecp256k1-config.h" #endif #include "util.h" #include "num.h" #if defined(SECP256K1_WIDEMUL_INT128) #include "field_5x52_impl.h" #elif defined(SECP256K1_WIDEMUL_INT64) #include "field_10x26_impl.h" #else #error "Please select wide multiplication implementation" #endif SECP256K1_INLINE static int secp256k1_fe_equal(const secp256k1_fe *a, const secp256k1_fe *b) { secp256k1_fe na; secp256k1_fe_negate(&na, a, 1); secp256k1_fe_add(&na, b); return secp256k1_fe_normalizes_to_zero(&na); } SECP256K1_INLINE static int secp256k1_fe_equal_var(const secp256k1_fe *a, const secp256k1_fe *b) { secp256k1_fe na; secp256k1_fe_negate(&na, a, 1); secp256k1_fe_add(&na, b); return secp256k1_fe_normalizes_to_zero_var(&na); } static int secp256k1_fe_sqrt(secp256k1_fe *r, const secp256k1_fe *a) { /** Given that p is congruent to 3 mod 4, we can compute the square root of * a mod p as the (p+1)/4'th power of a. * * As (p+1)/4 is an even number, it will have the same result for a and for * (-a). Only one of these two numbers actually has a square root however, * so we test at the end by squaring and comparing to the input. 
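 *
 *  Why that check suffices: the candidate squared is a^((p+1)/2) =
 *  a * a^((p-1)/2), and by Euler's criterion a^((p-1)/2) is +1 exactly
 *  when a is a square mod p and -1 otherwise, so for a non-square the
 *  candidate squares to -a and the comparison fails. Callers must
 *  therefore test the return value:
 *
 *      secp256k1_fe s;
 *      if (!secp256k1_fe_sqrt(&s, &a)) {
 *          ... a has no square root mod p ...
 *      }
 *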
* Also because (p+1)/4 is an even number, the computed square root is * itself always a square (a ** ((p+1)/4) is the square of a ** ((p+1)/8)). */ secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; int j; VERIFY_CHECK(r != a); /** The binary representation of (p + 1)/4 has 3 blocks of 1s, with lengths in * { 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: * 1, [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] */ secp256k1_fe_sqr(&x2, a); secp256k1_fe_mul(&x2, &x2, a); secp256k1_fe_sqr(&x3, &x2); secp256k1_fe_mul(&x3, &x3, a); x6 = x3; for (j=0; j<3; j++) { secp256k1_fe_sqr(&x6, &x6); } secp256k1_fe_mul(&x6, &x6, &x3); x9 = x6; for (j=0; j<3; j++) { secp256k1_fe_sqr(&x9, &x9); } secp256k1_fe_mul(&x9, &x9, &x3); x11 = x9; for (j=0; j<2; j++) { secp256k1_fe_sqr(&x11, &x11); } secp256k1_fe_mul(&x11, &x11, &x2); x22 = x11; for (j=0; j<11; j++) { secp256k1_fe_sqr(&x22, &x22); } secp256k1_fe_mul(&x22, &x22, &x11); x44 = x22; for (j=0; j<22; j++) { secp256k1_fe_sqr(&x44, &x44); } secp256k1_fe_mul(&x44, &x44, &x22); x88 = x44; for (j=0; j<44; j++) { secp256k1_fe_sqr(&x88, &x88); } secp256k1_fe_mul(&x88, &x88, &x44); x176 = x88; for (j=0; j<88; j++) { secp256k1_fe_sqr(&x176, &x176); } secp256k1_fe_mul(&x176, &x176, &x88); x220 = x176; for (j=0; j<44; j++) { secp256k1_fe_sqr(&x220, &x220); } secp256k1_fe_mul(&x220, &x220, &x44); x223 = x220; for (j=0; j<3; j++) { secp256k1_fe_sqr(&x223, &x223); } secp256k1_fe_mul(&x223, &x223, &x3); /* The final result is then assembled using a sliding window over the blocks. */ t1 = x223; for (j=0; j<23; j++) { secp256k1_fe_sqr(&t1, &t1); } secp256k1_fe_mul(&t1, &t1, &x22); for (j=0; j<6; j++) { secp256k1_fe_sqr(&t1, &t1); } secp256k1_fe_mul(&t1, &t1, &x2); secp256k1_fe_sqr(&t1, &t1); secp256k1_fe_sqr(r, &t1); /* Check that a square root was actually calculated */ secp256k1_fe_sqr(&t1, r); return secp256k1_fe_equal(&t1, a); } -static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) { - secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1; - int j; - - /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in - * { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block: - * [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223] - */ - - secp256k1_fe_sqr(&x2, a); - secp256k1_fe_mul(&x2, &x2, a); - - secp256k1_fe_sqr(&x3, &x2); - secp256k1_fe_mul(&x3, &x3, a); - - x6 = x3; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x6, &x6); - } - secp256k1_fe_mul(&x6, &x6, &x3); - - x9 = x6; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x9, &x9); - } - secp256k1_fe_mul(&x9, &x9, &x3); - - x11 = x9; - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&x11, &x11); - } - secp256k1_fe_mul(&x11, &x11, &x2); - - x22 = x11; - for (j=0; j<11; j++) { - secp256k1_fe_sqr(&x22, &x22); - } - secp256k1_fe_mul(&x22, &x22, &x11); - - x44 = x22; - for (j=0; j<22; j++) { - secp256k1_fe_sqr(&x44, &x44); - } - secp256k1_fe_mul(&x44, &x44, &x22); - - x88 = x44; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x88, &x88); - } - secp256k1_fe_mul(&x88, &x88, &x44); - - x176 = x88; - for (j=0; j<88; j++) { - secp256k1_fe_sqr(&x176, &x176); - } - secp256k1_fe_mul(&x176, &x176, &x88); - - x220 = x176; - for (j=0; j<44; j++) { - secp256k1_fe_sqr(&x220, &x220); - } - secp256k1_fe_mul(&x220, &x220, &x44); - - x223 = x220; - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&x223, &x223); - } - secp256k1_fe_mul(&x223, &x223, &x3); - - /* The final result is then assembled using a sliding window over the blocks. 
*/ - - t1 = x223; - for (j=0; j<23; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x22); - for (j=0; j<5; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, a); - for (j=0; j<3; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(&t1, &t1, &x2); - for (j=0; j<2; j++) { - secp256k1_fe_sqr(&t1, &t1); - } - secp256k1_fe_mul(r, a, &t1); -} - -static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) { -#if defined(USE_FIELD_INV_BUILTIN) - secp256k1_fe_inv(r, a); -#elif defined(USE_FIELD_INV_NUM) - secp256k1_num n, m; - static const secp256k1_fe negone = SECP256K1_FE_CONST( - 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, - 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL, 0xFFFFFC2EUL - ); - /* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */ - static const unsigned char prime[32] = { - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, - 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F - }; - unsigned char b[32]; - int res; - secp256k1_fe c = *a; - secp256k1_fe_normalize_var(&c); - secp256k1_fe_get_b32(b, &c); - secp256k1_num_set_bin(&n, b, 32); - secp256k1_num_set_bin(&m, prime, 32); - secp256k1_num_mod_inverse(&n, &n, &m); - secp256k1_num_get_bin(b, 32, &n); - res = secp256k1_fe_set_b32(r, b); - (void)res; - VERIFY_CHECK(res); - /* Verify the result is the (unique) valid inverse using non-GMP code. */ - secp256k1_fe_mul(&c, &c, r); - secp256k1_fe_add(&c, &negone); - CHECK(secp256k1_fe_normalizes_to_zero_var(&c)); -#else -#error "Please select field inverse implementation" -#endif -} - static int secp256k1_fe_is_quad_var(const secp256k1_fe *a) { #ifndef USE_NUM_NONE unsigned char b[32]; secp256k1_num n; secp256k1_num m; /* secp256k1 field prime, value p defined in "Standards for Efficient Cryptography" (SEC2) 2.7.1. */ static const unsigned char prime[32] = { 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFE,0xFF,0xFF,0xFC,0x2F }; secp256k1_fe c = *a; secp256k1_fe_normalize_var(&c); secp256k1_fe_get_b32(b, &c); secp256k1_num_set_bin(&n, b, 32); secp256k1_num_set_bin(&m, prime, 32); return secp256k1_num_jacobi(&n, &m) >= 0; #else secp256k1_fe r; return secp256k1_fe_sqrt(&r, a); #endif } static const secp256k1_fe secp256k1_fe_one = SECP256K1_FE_CONST(0, 0, 0, 0, 0, 0, 0, 1); #endif /* SECP256K1_FIELD_IMPL_H */ diff --git a/src/secp256k1/src/scalar_4x64_impl.h b/src/secp256k1/src/scalar_4x64_impl.h index 3eaa0418c..6ba38e25e 100644 --- a/src/secp256k1/src/scalar_4x64_impl.h +++ b/src/secp256k1/src/scalar_4x64_impl.h @@ -1,958 +1,1137 @@ /*********************************************************************** * Copyright (c) 2013, 2014 Pieter Wuille * * Distributed under the MIT software license, see the accompanying * * file COPYING or https://www.opensource.org/licenses/mit-license.php.* ***********************************************************************/ #ifndef SECP256K1_SCALAR_REPR_IMPL_H #define SECP256K1_SCALAR_REPR_IMPL_H /* Limbs of the secp256k1 order. */ #define SECP256K1_N_0 ((uint64_t)0xBFD25E8CD0364141ULL) #define SECP256K1_N_1 ((uint64_t)0xBAAEDCE6AF48A03BULL) #define SECP256K1_N_2 ((uint64_t)0xFFFFFFFFFFFFFFFEULL) #define SECP256K1_N_3 ((uint64_t)0xFFFFFFFFFFFFFFFFULL) /* Limbs of 2^256 minus the secp256k1 order. 
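 *
 * Writing c for this value: n lies just below 2^256, so c = 2^256 - n
 * fits in 129 bits and 2^256 == c (mod n). That identity is what makes
 * the reductions below cheap: any limb accumulated at weight 2^256 can
 * be folded back in by one multiply by c. A sketch of a single fold of
 * hi * 2^256 + lo[0..3] (fold_once is a hypothetical name; uint128_t is
 * the wide type used throughout this file):
 *
 *     static void fold_once(uint64_t r[4], const uint64_t lo[4], uint64_t hi) {
 *         uint128_t t = (uint128_t)lo[0] + (uint128_t)hi * SECP256K1_N_C_0;
 *         r[0] = (uint64_t)t; t >>= 64;
 *         t += (uint128_t)lo[1] + (uint128_t)hi * SECP256K1_N_C_1;
 *         r[1] = (uint64_t)t; t >>= 64;
 *         t += (uint128_t)lo[2] + hi;   // SECP256K1_N_C_2 == 1
 *         r[2] = (uint64_t)t; t >>= 64;
 *         t += lo[3];
 *         r[3] = (uint64_t)t;
 *         // the result can still be >= n or carry past bit 255; the real
 *         // code finishes with secp256k1_scalar_reduce below
 *     }
 *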
*/ #define SECP256K1_N_C_0 (~SECP256K1_N_0 + 1) #define SECP256K1_N_C_1 (~SECP256K1_N_1) #define SECP256K1_N_C_2 (1) /* Limbs of half the secp256k1 order. */ #define SECP256K1_N_H_0 ((uint64_t)0xDFE92F46681B20A0ULL) #define SECP256K1_N_H_1 ((uint64_t)0x5D576E7357A4501DULL) #define SECP256K1_N_H_2 ((uint64_t)0xFFFFFFFFFFFFFFFFULL) #define SECP256K1_N_H_3 ((uint64_t)0x7FFFFFFFFFFFFFFFULL) SECP256K1_INLINE static void secp256k1_scalar_clear(secp256k1_scalar *r) { r->d[0] = 0; r->d[1] = 0; r->d[2] = 0; r->d[3] = 0; } SECP256K1_INLINE static void secp256k1_scalar_set_int(secp256k1_scalar *r, unsigned int v) { r->d[0] = v; r->d[1] = 0; r->d[2] = 0; r->d[3] = 0; } SECP256K1_INLINE static unsigned int secp256k1_scalar_get_bits(const secp256k1_scalar *a, unsigned int offset, unsigned int count) { VERIFY_CHECK((offset + count - 1) >> 6 == offset >> 6); return (a->d[offset >> 6] >> (offset & 0x3F)) & ((((uint64_t)1) << count) - 1); } SECP256K1_INLINE static unsigned int secp256k1_scalar_get_bits_var(const secp256k1_scalar *a, unsigned int offset, unsigned int count) { VERIFY_CHECK(count < 32); VERIFY_CHECK(offset + count <= 256); if ((offset + count - 1) >> 6 == offset >> 6) { return secp256k1_scalar_get_bits(a, offset, count); } else { VERIFY_CHECK((offset >> 6) + 1 < 4); return ((a->d[offset >> 6] >> (offset & 0x3F)) | (a->d[(offset >> 6) + 1] << (64 - (offset & 0x3F)))) & ((((uint64_t)1) << count) - 1); } } SECP256K1_INLINE static int secp256k1_scalar_check_overflow(const secp256k1_scalar *a) { int yes = 0; int no = 0; no |= (a->d[3] < SECP256K1_N_3); /* No need for a > check. */ no |= (a->d[2] < SECP256K1_N_2); yes |= (a->d[2] > SECP256K1_N_2) & ~no; no |= (a->d[1] < SECP256K1_N_1); yes |= (a->d[1] > SECP256K1_N_1) & ~no; yes |= (a->d[0] >= SECP256K1_N_0) & ~no; return yes; } SECP256K1_INLINE static int secp256k1_scalar_reduce(secp256k1_scalar *r, unsigned int overflow) { uint128_t t; VERIFY_CHECK(overflow <= 1); t = (uint128_t)r->d[0] + overflow * SECP256K1_N_C_0; r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; t += (uint128_t)r->d[1] + overflow * SECP256K1_N_C_1; r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; t += (uint128_t)r->d[2] + overflow * SECP256K1_N_C_2; r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; t += (uint64_t)r->d[3]; r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL; return overflow; } static int secp256k1_scalar_add(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b) { int overflow; uint128_t t = (uint128_t)a->d[0] + b->d[0]; r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; t += (uint128_t)a->d[1] + b->d[1]; r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; t += (uint128_t)a->d[2] + b->d[2]; r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; t += (uint128_t)a->d[3] + b->d[3]; r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; overflow = t + secp256k1_scalar_check_overflow(r); VERIFY_CHECK(overflow == 0 || overflow == 1); secp256k1_scalar_reduce(r, overflow); return overflow; } static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int flag) { uint128_t t; VERIFY_CHECK(bit < 256); bit += ((uint32_t) flag - 1) & 0x100; /* forcing (bit >> 6) > 3 makes this a noop */ t = (uint128_t)r->d[0] + (((uint64_t)((bit >> 6) == 0)) << (bit & 0x3F)); r->d[0] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; t += (uint128_t)r->d[1] + (((uint64_t)((bit >> 6) == 1)) << (bit & 0x3F)); r->d[1] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; t += (uint128_t)r->d[2] + (((uint64_t)((bit >> 6) == 2)) << (bit & 0x3F)); r->d[2] = t & 0xFFFFFFFFFFFFFFFFULL; t >>= 64; t += (uint128_t)r->d[3] + 
(((uint64_t)((bit >> 6) == 3)) << (bit & 0x3F)); r->d[3] = t & 0xFFFFFFFFFFFFFFFFULL; #ifdef VERIFY VERIFY_CHECK((t >> 64) == 0); VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0); #endif } static void secp256k1_scalar_set_b32(secp256k1_scalar *r, const unsigned char *b32, int *overflow) { int over; r->d[0] = (uint64_t)b32[31] | (uint64_t)b32[30] << 8 | (uint64_t)b32[29] << 16 | (uint64_t)b32[28] << 24 | (uint64_t)b32[27] << 32 | (uint64_t)b32[26] << 40 | (uint64_t)b32[25] << 48 | (uint64_t)b32[24] << 56; r->d[1] = (uint64_t)b32[23] | (uint64_t)b32[22] << 8 | (uint64_t)b32[21] << 16 | (uint64_t)b32[20] << 24 | (uint64_t)b32[19] << 32 | (uint64_t)b32[18] << 40 | (uint64_t)b32[17] << 48 | (uint64_t)b32[16] << 56; r->d[2] = (uint64_t)b32[15] | (uint64_t)b32[14] << 8 | (uint64_t)b32[13] << 16 | (uint64_t)b32[12] << 24 | (uint64_t)b32[11] << 32 | (uint64_t)b32[10] << 40 | (uint64_t)b32[9] << 48 | (uint64_t)b32[8] << 56; r->d[3] = (uint64_t)b32[7] | (uint64_t)b32[6] << 8 | (uint64_t)b32[5] << 16 | (uint64_t)b32[4] << 24 | (uint64_t)b32[3] << 32 | (uint64_t)b32[2] << 40 | (uint64_t)b32[1] << 48 | (uint64_t)b32[0] << 56; over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); if (overflow) { *overflow = over; } } static void secp256k1_scalar_get_b32(unsigned char *bin, const secp256k1_scalar* a) { bin[0] = a->d[3] >> 56; bin[1] = a->d[3] >> 48; bin[2] = a->d[3] >> 40; bin[3] = a->d[3] >> 32; bin[4] = a->d[3] >> 24; bin[5] = a->d[3] >> 16; bin[6] = a->d[3] >> 8; bin[7] = a->d[3]; bin[8] = a->d[2] >> 56; bin[9] = a->d[2] >> 48; bin[10] = a->d[2] >> 40; bin[11] = a->d[2] >> 32; bin[12] = a->d[2] >> 24; bin[13] = a->d[2] >> 16; bin[14] = a->d[2] >> 8; bin[15] = a->d[2]; bin[16] = a->d[1] >> 56; bin[17] = a->d[1] >> 48; bin[18] = a->d[1] >> 40; bin[19] = a->d[1] >> 32; bin[20] = a->d[1] >> 24; bin[21] = a->d[1] >> 16; bin[22] = a->d[1] >> 8; bin[23] = a->d[1]; bin[24] = a->d[0] >> 56; bin[25] = a->d[0] >> 48; bin[26] = a->d[0] >> 40; bin[27] = a->d[0] >> 32; bin[28] = a->d[0] >> 24; bin[29] = a->d[0] >> 16; bin[30] = a->d[0] >> 8; bin[31] = a->d[0]; } SECP256K1_INLINE static int secp256k1_scalar_is_zero(const secp256k1_scalar *a) { return (a->d[0] | a->d[1] | a->d[2] | a->d[3]) == 0; } static void secp256k1_scalar_negate(secp256k1_scalar *r, const secp256k1_scalar *a) { uint64_t nonzero = 0xFFFFFFFFFFFFFFFFULL * (secp256k1_scalar_is_zero(a) == 0); uint128_t t = (uint128_t)(~a->d[0]) + SECP256K1_N_0 + 1; r->d[0] = t & nonzero; t >>= 64; t += (uint128_t)(~a->d[1]) + SECP256K1_N_1; r->d[1] = t & nonzero; t >>= 64; t += (uint128_t)(~a->d[2]) + SECP256K1_N_2; r->d[2] = t & nonzero; t >>= 64; t += (uint128_t)(~a->d[3]) + SECP256K1_N_3; r->d[3] = t & nonzero; } SECP256K1_INLINE static int secp256k1_scalar_is_one(const secp256k1_scalar *a) { return ((a->d[0] ^ 1) | a->d[1] | a->d[2] | a->d[3]) == 0; } static int secp256k1_scalar_is_high(const secp256k1_scalar *a) { int yes = 0; int no = 0; no |= (a->d[3] < SECP256K1_N_H_3); yes |= (a->d[3] > SECP256K1_N_H_3) & ~no; no |= (a->d[2] < SECP256K1_N_H_2) & ~yes; /* No need for a > check. 
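 *
 * (that limb of n/2 is all ones, so "greater" is impossible there and
 * only the < side needs tracking. The yes/no pair is a branch-free
 * lexicographic compare: once a higher limb has decided, the & ~yes and
 * & ~no terms mask out every lower limb. The same pattern on two limbs,
 * testing (a1,a0) > (b1,b0):
 *
 *     int lt = 0, gt = 0;
 *     lt |= (a1 < b1);
 *     gt |= (a1 > b1) & ~lt;
 *     gt |= (a0 > b0) & ~lt;   // the low limb only decides a tie
 * )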
*/ no |= (a->d[1] < SECP256K1_N_H_1) & ~yes; yes |= (a->d[1] > SECP256K1_N_H_1) & ~no; yes |= (a->d[0] > SECP256K1_N_H_0) & ~no; return yes; } static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) { /* If we are flag = 0, mask = 00...00 and this is a no-op; * if we are flag = 1, mask = 11...11 and this is identical to secp256k1_scalar_negate */ uint64_t mask = !flag - 1; uint64_t nonzero = (secp256k1_scalar_is_zero(r) != 0) - 1; uint128_t t = (uint128_t)(r->d[0] ^ mask) + ((SECP256K1_N_0 + 1) & mask); r->d[0] = t & nonzero; t >>= 64; t += (uint128_t)(r->d[1] ^ mask) + (SECP256K1_N_1 & mask); r->d[1] = t & nonzero; t >>= 64; t += (uint128_t)(r->d[2] ^ mask) + (SECP256K1_N_2 & mask); r->d[2] = t & nonzero; t >>= 64; t += (uint128_t)(r->d[3] ^ mask) + (SECP256K1_N_3 & mask); r->d[3] = t & nonzero; return 2 * (mask == 0) - 1; } /* Inspired by the macros in OpenSSL's crypto/bn/asm/x86_64-gcc.c. */ /** Add a*b to the number defined by (c0,c1,c2). c2 must never overflow. */ #define muladd(a,b) { \ uint64_t tl, th; \ { \ uint128_t t = (uint128_t)a * b; \ th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \ tl = t; \ } \ c0 += tl; /* overflow is handled on the next line */ \ th += (c0 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \ c1 += th; /* overflow is handled on the next line */ \ c2 += (c1 < th); /* never overflows by contract (verified in the next line) */ \ VERIFY_CHECK((c1 >= th) || (c2 != 0)); \ } /** Add a*b to the number defined by (c0,c1). c1 must never overflow. */ #define muladd_fast(a,b) { \ uint64_t tl, th; \ { \ uint128_t t = (uint128_t)a * b; \ th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \ tl = t; \ } \ c0 += tl; /* overflow is handled on the next line */ \ th += (c0 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \ c1 += th; /* never overflows by contract (verified in the next line) */ \ VERIFY_CHECK(c1 >= th); \ } /** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */ #define muladd2(a,b) { \ uint64_t tl, th, th2, tl2; \ { \ uint128_t t = (uint128_t)a * b; \ th = t >> 64; /* at most 0xFFFFFFFFFFFFFFFE */ \ tl = t; \ } \ th2 = th + th; /* at most 0xFFFFFFFFFFFFFFFE (in case th was 0x7FFFFFFFFFFFFFFF) */ \ c2 += (th2 < th); /* never overflows by contract (verified the next line) */ \ VERIFY_CHECK((th2 >= th) || (c2 != 0)); \ tl2 = tl + tl; /* at most 0xFFFFFFFFFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFFFFFFFFFF) */ \ th2 += (tl2 < tl); /* at most 0xFFFFFFFFFFFFFFFF */ \ c0 += tl2; /* overflow is handled on the next line */ \ th2 += (c0 < tl2); /* second overflow is handled on the next line */ \ c2 += (c0 < tl2) & (th2 == 0); /* never overflows by contract (verified the next line) */ \ VERIFY_CHECK((c0 >= tl2) || (th2 != 0) || (c2 != 0)); \ c1 += th2; /* overflow is handled on the next line */ \ c2 += (c1 < th2); /* never overflows by contract (verified the next line) */ \ VERIFY_CHECK((c1 >= th2) || (c2 != 0)); \ } /** Add a to the number defined by (c0,c1,c2). c2 must never overflow. */ #define sumadd(a) { \ unsigned int over; \ c0 += (a); /* overflow is handled on the next line */ \ over = (c0 < (a)); \ c1 += over; /* overflow is handled on the next line */ \ c2 += (c1 < over); /* never overflows by contract */ \ } /** Add a to the number defined by (c0,c1). c1 must never overflow, c2 must be zero. 
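 *
 *  All of these macros detect carries via unsigned wraparound: after
 *  c0 += a, the sum wrapped iff the result is now smaller than the
 *  addend, so (c0 < a) is exactly the carry bit. For example:
 *
 *      uint64_t c0 = 0xFFFFFFFFFFFFFFF0ULL, a = 0x20;
 *      c0 += a;                        // c0 == 0x10 after wrapping
 *      uint64_t carry = (c0 < a);      // carry == 1
 *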
*/ #define sumadd_fast(a) { \ c0 += (a); /* overflow is handled on the next line */ \ c1 += (c0 < (a)); /* never overflows by contract (verified the next line) */ \ VERIFY_CHECK((c1 != 0) | (c0 >= (a))); \ VERIFY_CHECK(c2 == 0); \ } /** Extract the lowest 64 bits of (c0,c1,c2) into n, and left shift the number 64 bits. */ #define extract(n) { \ (n) = c0; \ c0 = c1; \ c1 = c2; \ c2 = 0; \ } /** Extract the lowest 64 bits of (c0,c1,c2) into n, and left shift the number 64 bits. c2 is required to be zero. */ #define extract_fast(n) { \ (n) = c0; \ c0 = c1; \ c1 = 0; \ VERIFY_CHECK(c2 == 0); \ } static void secp256k1_scalar_reduce_512(secp256k1_scalar *r, const uint64_t *l) { #ifdef USE_ASM_X86_64 /* Reduce 512 bits into 385. */ uint64_t m0, m1, m2, m3, m4, m5, m6; uint64_t p0, p1, p2, p3, p4; uint64_t c; __asm__ __volatile__( /* Preload. */ "movq 32(%%rsi), %%r11\n" "movq 40(%%rsi), %%r12\n" "movq 48(%%rsi), %%r13\n" "movq 56(%%rsi), %%r14\n" /* Initialize r8,r9,r10 */ "movq 0(%%rsi), %%r8\n" "xorq %%r9, %%r9\n" "xorq %%r10, %%r10\n" /* (r8,r9) += n0 * c0 */ "movq %8, %%rax\n" "mulq %%r11\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" /* extract m0 */ "movq %%r8, %q0\n" "xorq %%r8, %%r8\n" /* (r9,r10) += l1 */ "addq 8(%%rsi), %%r9\n" "adcq $0, %%r10\n" /* (r9,r10,r8) += n1 * c0 */ "movq %8, %%rax\n" "mulq %%r12\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" /* (r9,r10,r8) += n0 * c1 */ "movq %9, %%rax\n" "mulq %%r11\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" /* extract m1 */ "movq %%r9, %q1\n" "xorq %%r9, %%r9\n" /* (r10,r8,r9) += l2 */ "addq 16(%%rsi), %%r10\n" "adcq $0, %%r8\n" "adcq $0, %%r9\n" /* (r10,r8,r9) += n2 * c0 */ "movq %8, %%rax\n" "mulq %%r13\n" "addq %%rax, %%r10\n" "adcq %%rdx, %%r8\n" "adcq $0, %%r9\n" /* (r10,r8,r9) += n1 * c1 */ "movq %9, %%rax\n" "mulq %%r12\n" "addq %%rax, %%r10\n" "adcq %%rdx, %%r8\n" "adcq $0, %%r9\n" /* (r10,r8,r9) += n0 */ "addq %%r11, %%r10\n" "adcq $0, %%r8\n" "adcq $0, %%r9\n" /* extract m2 */ "movq %%r10, %q2\n" "xorq %%r10, %%r10\n" /* (r8,r9,r10) += l3 */ "addq 24(%%rsi), %%r8\n" "adcq $0, %%r9\n" "adcq $0, %%r10\n" /* (r8,r9,r10) += n3 * c0 */ "movq %8, %%rax\n" "mulq %%r14\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" "adcq $0, %%r10\n" /* (r8,r9,r10) += n2 * c1 */ "movq %9, %%rax\n" "mulq %%r13\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" "adcq $0, %%r10\n" /* (r8,r9,r10) += n1 */ "addq %%r12, %%r8\n" "adcq $0, %%r9\n" "adcq $0, %%r10\n" /* extract m3 */ "movq %%r8, %q3\n" "xorq %%r8, %%r8\n" /* (r9,r10,r8) += n3 * c1 */ "movq %9, %%rax\n" "mulq %%r14\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" /* (r9,r10,r8) += n2 */ "addq %%r13, %%r9\n" "adcq $0, %%r10\n" "adcq $0, %%r8\n" /* extract m4 */ "movq %%r9, %q4\n" /* (r10,r8) += n3 */ "addq %%r14, %%r10\n" "adcq $0, %%r8\n" /* extract m5 */ "movq %%r10, %q5\n" /* extract m6 */ "movq %%r8, %q6\n" : "=g"(m0), "=g"(m1), "=g"(m2), "=g"(m3), "=g"(m4), "=g"(m5), "=g"(m6) : "S"(l), "i"(SECP256K1_N_C_0), "i"(SECP256K1_N_C_1) : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc"); /* Reduce 385 bits into 258. 
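 *
 * Size bookkeeping for the three folds, writing c = 2^256 - n (c < 2^129):
 *
 *     l < 2^512:  fold m = l_low + l_high * c  =>  m needs at most 385 bits
 *     m < 2^385:  fold p = m_low + m_high * c  =>  p needs at most 258 bits
 *     p < 2^258:  fold r = p_low + p4 * c, with p4 <= 2
 *
 * leaving r within a hair of 2^256, so one conditional subtraction of n
 * (the secp256k1_scalar_reduce call at the end) completes the reduction.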
*/ __asm__ __volatile__( /* Preload */ "movq %q9, %%r11\n" "movq %q10, %%r12\n" "movq %q11, %%r13\n" /* Initialize (r8,r9,r10) */ "movq %q5, %%r8\n" "xorq %%r9, %%r9\n" "xorq %%r10, %%r10\n" /* (r8,r9) += m4 * c0 */ "movq %12, %%rax\n" "mulq %%r11\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" /* extract p0 */ "movq %%r8, %q0\n" "xorq %%r8, %%r8\n" /* (r9,r10) += m1 */ "addq %q6, %%r9\n" "adcq $0, %%r10\n" /* (r9,r10,r8) += m5 * c0 */ "movq %12, %%rax\n" "mulq %%r12\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" /* (r9,r10,r8) += m4 * c1 */ "movq %13, %%rax\n" "mulq %%r11\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" /* extract p1 */ "movq %%r9, %q1\n" "xorq %%r9, %%r9\n" /* (r10,r8,r9) += m2 */ "addq %q7, %%r10\n" "adcq $0, %%r8\n" "adcq $0, %%r9\n" /* (r10,r8,r9) += m6 * c0 */ "movq %12, %%rax\n" "mulq %%r13\n" "addq %%rax, %%r10\n" "adcq %%rdx, %%r8\n" "adcq $0, %%r9\n" /* (r10,r8,r9) += m5 * c1 */ "movq %13, %%rax\n" "mulq %%r12\n" "addq %%rax, %%r10\n" "adcq %%rdx, %%r8\n" "adcq $0, %%r9\n" /* (r10,r8,r9) += m4 */ "addq %%r11, %%r10\n" "adcq $0, %%r8\n" "adcq $0, %%r9\n" /* extract p2 */ "movq %%r10, %q2\n" /* (r8,r9) += m3 */ "addq %q8, %%r8\n" "adcq $0, %%r9\n" /* (r8,r9) += m6 * c1 */ "movq %13, %%rax\n" "mulq %%r13\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" /* (r8,r9) += m5 */ "addq %%r12, %%r8\n" "adcq $0, %%r9\n" /* extract p3 */ "movq %%r8, %q3\n" /* (r9) += m6 */ "addq %%r13, %%r9\n" /* extract p4 */ "movq %%r9, %q4\n" : "=&g"(p0), "=&g"(p1), "=&g"(p2), "=g"(p3), "=g"(p4) : "g"(m0), "g"(m1), "g"(m2), "g"(m3), "g"(m4), "g"(m5), "g"(m6), "i"(SECP256K1_N_C_0), "i"(SECP256K1_N_C_1) : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "cc"); /* Reduce 258 bits into 256. */ __asm__ __volatile__( /* Preload */ "movq %q5, %%r10\n" /* (rax,rdx) = p4 * c0 */ "movq %7, %%rax\n" "mulq %%r10\n" /* (rax,rdx) += p0 */ "addq %q1, %%rax\n" "adcq $0, %%rdx\n" /* extract r0 */ "movq %%rax, 0(%q6)\n" /* Move to (r8,r9) */ "movq %%rdx, %%r8\n" "xorq %%r9, %%r9\n" /* (r8,r9) += p1 */ "addq %q2, %%r8\n" "adcq $0, %%r9\n" /* (r8,r9) += p4 * c1 */ "movq %8, %%rax\n" "mulq %%r10\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" /* Extract r1 */ "movq %%r8, 8(%q6)\n" "xorq %%r8, %%r8\n" /* (r9,r8) += p4 */ "addq %%r10, %%r9\n" "adcq $0, %%r8\n" /* (r9,r8) += p2 */ "addq %q3, %%r9\n" "adcq $0, %%r8\n" /* Extract r2 */ "movq %%r9, 16(%q6)\n" "xorq %%r9, %%r9\n" /* (r8,r9) += p3 */ "addq %q4, %%r8\n" "adcq $0, %%r9\n" /* Extract r3 */ "movq %%r8, 24(%q6)\n" /* Extract c */ "movq %%r9, %q0\n" : "=g"(c) : "g"(p0), "g"(p1), "g"(p2), "g"(p3), "g"(p4), "D"(r), "i"(SECP256K1_N_C_0), "i"(SECP256K1_N_C_1) : "rax", "rdx", "r8", "r9", "r10", "cc", "memory"); #else uint128_t c; uint64_t c0, c1, c2; uint64_t n0 = l[4], n1 = l[5], n2 = l[6], n3 = l[7]; uint64_t m0, m1, m2, m3, m4, m5; uint32_t m6; uint64_t p0, p1, p2, p3; uint32_t p4; /* Reduce 512 bits into 385. */ /* m[0..6] = l[0..3] + n[0..3] * SECP256K1_N_C. */ c0 = l[0]; c1 = 0; c2 = 0; muladd_fast(n0, SECP256K1_N_C_0); extract_fast(m0); sumadd_fast(l[1]); muladd(n1, SECP256K1_N_C_0); muladd(n0, SECP256K1_N_C_1); extract(m1); sumadd(l[2]); muladd(n2, SECP256K1_N_C_0); muladd(n1, SECP256K1_N_C_1); sumadd(n0); extract(m2); sumadd(l[3]); muladd(n3, SECP256K1_N_C_0); muladd(n2, SECP256K1_N_C_1); sumadd(n1); extract(m3); muladd(n3, SECP256K1_N_C_1); sumadd(n2); extract(m4); sumadd_fast(n3); extract_fast(m5); VERIFY_CHECK(c0 <= 1); m6 = c0; /* Reduce 385 bits into 258. */ /* p[0..4] = m[0..3] + m[4..6] * SECP256K1_N_C. 
*/ c0 = m0; c1 = 0; c2 = 0; muladd_fast(m4, SECP256K1_N_C_0); extract_fast(p0); sumadd_fast(m1); muladd(m5, SECP256K1_N_C_0); muladd(m4, SECP256K1_N_C_1); extract(p1); sumadd(m2); muladd(m6, SECP256K1_N_C_0); muladd(m5, SECP256K1_N_C_1); sumadd(m4); extract(p2); sumadd_fast(m3); muladd_fast(m6, SECP256K1_N_C_1); sumadd_fast(m5); extract_fast(p3); p4 = c0 + m6; VERIFY_CHECK(p4 <= 2); /* Reduce 258 bits into 256. */ /* r[0..3] = p[0..3] + p[4] * SECP256K1_N_C. */ c = p0 + (uint128_t)SECP256K1_N_C_0 * p4; r->d[0] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64; c += p1 + (uint128_t)SECP256K1_N_C_1 * p4; r->d[1] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64; c += p2 + (uint128_t)p4; r->d[2] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64; c += p3; r->d[3] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64; #endif /* Final reduction of r. */ secp256k1_scalar_reduce(r, c + secp256k1_scalar_check_overflow(r)); } static void secp256k1_scalar_mul_512(uint64_t l[8], const secp256k1_scalar *a, const secp256k1_scalar *b) { #ifdef USE_ASM_X86_64 const uint64_t *pb = b->d; __asm__ __volatile__( /* Preload */ "movq 0(%%rdi), %%r15\n" "movq 8(%%rdi), %%rbx\n" "movq 16(%%rdi), %%rcx\n" "movq 0(%%rdx), %%r11\n" "movq 8(%%rdx), %%r12\n" "movq 16(%%rdx), %%r13\n" "movq 24(%%rdx), %%r14\n" /* (rax,rdx) = a0 * b0 */ "movq %%r15, %%rax\n" "mulq %%r11\n" /* Extract l0 */ "movq %%rax, 0(%%rsi)\n" /* (r8,r9,r10) = (rdx) */ "movq %%rdx, %%r8\n" "xorq %%r9, %%r9\n" "xorq %%r10, %%r10\n" /* (r8,r9,r10) += a0 * b1 */ "movq %%r15, %%rax\n" "mulq %%r12\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" "adcq $0, %%r10\n" /* (r8,r9,r10) += a1 * b0 */ "movq %%rbx, %%rax\n" "mulq %%r11\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" "adcq $0, %%r10\n" /* Extract l1 */ "movq %%r8, 8(%%rsi)\n" "xorq %%r8, %%r8\n" /* (r9,r10,r8) += a0 * b2 */ "movq %%r15, %%rax\n" "mulq %%r13\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" /* (r9,r10,r8) += a1 * b1 */ "movq %%rbx, %%rax\n" "mulq %%r12\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" /* (r9,r10,r8) += a2 * b0 */ "movq %%rcx, %%rax\n" "mulq %%r11\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" /* Extract l2 */ "movq %%r9, 16(%%rsi)\n" "xorq %%r9, %%r9\n" /* (r10,r8,r9) += a0 * b3 */ "movq %%r15, %%rax\n" "mulq %%r14\n" "addq %%rax, %%r10\n" "adcq %%rdx, %%r8\n" "adcq $0, %%r9\n" /* Preload a3 */ "movq 24(%%rdi), %%r15\n" /* (r10,r8,r9) += a1 * b2 */ "movq %%rbx, %%rax\n" "mulq %%r13\n" "addq %%rax, %%r10\n" "adcq %%rdx, %%r8\n" "adcq $0, %%r9\n" /* (r10,r8,r9) += a2 * b1 */ "movq %%rcx, %%rax\n" "mulq %%r12\n" "addq %%rax, %%r10\n" "adcq %%rdx, %%r8\n" "adcq $0, %%r9\n" /* (r10,r8,r9) += a3 * b0 */ "movq %%r15, %%rax\n" "mulq %%r11\n" "addq %%rax, %%r10\n" "adcq %%rdx, %%r8\n" "adcq $0, %%r9\n" /* Extract l3 */ "movq %%r10, 24(%%rsi)\n" "xorq %%r10, %%r10\n" /* (r8,r9,r10) += a1 * b3 */ "movq %%rbx, %%rax\n" "mulq %%r14\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" "adcq $0, %%r10\n" /* (r8,r9,r10) += a2 * b2 */ "movq %%rcx, %%rax\n" "mulq %%r13\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" "adcq $0, %%r10\n" /* (r8,r9,r10) += a3 * b1 */ "movq %%r15, %%rax\n" "mulq %%r12\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" "adcq $0, %%r10\n" /* Extract l4 */ "movq %%r8, 32(%%rsi)\n" "xorq %%r8, %%r8\n" /* (r9,r10,r8) += a2 * b3 */ "movq %%rcx, %%rax\n" "mulq %%r14\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" /* (r9,r10,r8) += a3 * b2 */ "movq %%r15, %%rax\n" "mulq %%r13\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" /* Extract l5 */ "movq %%r9, 
40(%%rsi)\n" /* (r10,r8) += a3 * b3 */ "movq %%r15, %%rax\n" "mulq %%r14\n" "addq %%rax, %%r10\n" "adcq %%rdx, %%r8\n" /* Extract l6 */ "movq %%r10, 48(%%rsi)\n" /* Extract l7 */ "movq %%r8, 56(%%rsi)\n" : "+d"(pb) : "S"(l), "D"(a->d) : "rax", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15", "cc", "memory"); #else /* 160 bit accumulator. */ uint64_t c0 = 0, c1 = 0; uint32_t c2 = 0; /* l[0..7] = a[0..3] * b[0..3]. */ muladd_fast(a->d[0], b->d[0]); extract_fast(l[0]); muladd(a->d[0], b->d[1]); muladd(a->d[1], b->d[0]); extract(l[1]); muladd(a->d[0], b->d[2]); muladd(a->d[1], b->d[1]); muladd(a->d[2], b->d[0]); extract(l[2]); muladd(a->d[0], b->d[3]); muladd(a->d[1], b->d[2]); muladd(a->d[2], b->d[1]); muladd(a->d[3], b->d[0]); extract(l[3]); muladd(a->d[1], b->d[3]); muladd(a->d[2], b->d[2]); muladd(a->d[3], b->d[1]); extract(l[4]); muladd(a->d[2], b->d[3]); muladd(a->d[3], b->d[2]); extract(l[5]); muladd_fast(a->d[3], b->d[3]); extract_fast(l[6]); VERIFY_CHECK(c1 == 0); l[7] = c0; #endif } static void secp256k1_scalar_sqr_512(uint64_t l[8], const secp256k1_scalar *a) { #ifdef USE_ASM_X86_64 __asm__ __volatile__( /* Preload */ "movq 0(%%rdi), %%r11\n" "movq 8(%%rdi), %%r12\n" "movq 16(%%rdi), %%r13\n" "movq 24(%%rdi), %%r14\n" /* (rax,rdx) = a0 * a0 */ "movq %%r11, %%rax\n" "mulq %%r11\n" /* Extract l0 */ "movq %%rax, 0(%%rsi)\n" /* (r8,r9,r10) = (rdx,0) */ "movq %%rdx, %%r8\n" "xorq %%r9, %%r9\n" "xorq %%r10, %%r10\n" /* (r8,r9,r10) += 2 * a0 * a1 */ "movq %%r11, %%rax\n" "mulq %%r12\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" "adcq $0, %%r10\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" "adcq $0, %%r10\n" /* Extract l1 */ "movq %%r8, 8(%%rsi)\n" "xorq %%r8, %%r8\n" /* (r9,r10,r8) += 2 * a0 * a2 */ "movq %%r11, %%rax\n" "mulq %%r13\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" /* (r9,r10,r8) += a1 * a1 */ "movq %%r12, %%rax\n" "mulq %%r12\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" /* Extract l2 */ "movq %%r9, 16(%%rsi)\n" "xorq %%r9, %%r9\n" /* (r10,r8,r9) += 2 * a0 * a3 */ "movq %%r11, %%rax\n" "mulq %%r14\n" "addq %%rax, %%r10\n" "adcq %%rdx, %%r8\n" "adcq $0, %%r9\n" "addq %%rax, %%r10\n" "adcq %%rdx, %%r8\n" "adcq $0, %%r9\n" /* (r10,r8,r9) += 2 * a1 * a2 */ "movq %%r12, %%rax\n" "mulq %%r13\n" "addq %%rax, %%r10\n" "adcq %%rdx, %%r8\n" "adcq $0, %%r9\n" "addq %%rax, %%r10\n" "adcq %%rdx, %%r8\n" "adcq $0, %%r9\n" /* Extract l3 */ "movq %%r10, 24(%%rsi)\n" "xorq %%r10, %%r10\n" /* (r8,r9,r10) += 2 * a1 * a3 */ "movq %%r12, %%rax\n" "mulq %%r14\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" "adcq $0, %%r10\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" "adcq $0, %%r10\n" /* (r8,r9,r10) += a2 * a2 */ "movq %%r13, %%rax\n" "mulq %%r13\n" "addq %%rax, %%r8\n" "adcq %%rdx, %%r9\n" "adcq $0, %%r10\n" /* Extract l4 */ "movq %%r8, 32(%%rsi)\n" "xorq %%r8, %%r8\n" /* (r9,r10,r8) += 2 * a2 * a3 */ "movq %%r13, %%rax\n" "mulq %%r14\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" "addq %%rax, %%r9\n" "adcq %%rdx, %%r10\n" "adcq $0, %%r8\n" /* Extract l5 */ "movq %%r9, 40(%%rsi)\n" /* (r10,r8) += a3 * a3 */ "movq %%r14, %%rax\n" "mulq %%r14\n" "addq %%rax, %%r10\n" "adcq %%rdx, %%r8\n" /* Extract l6 */ "movq %%r10, 48(%%rsi)\n" /* Extract l7 */ "movq %%r8, 56(%%rsi)\n" : : "S"(l), "D"(a->d) : "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc", "memory"); #else /* 160 bit accumulator. 
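 *
 * (c0 and c1 are 64-bit, c2 32-bit. 160 bits are plenty of headroom: one
 * column of the 4x4 schoolbook product sums at most four sub-2^128
 * partial products plus carries, staying under 2^131. The muladd/extract
 * macros walk those columns one output limb at a time; schematically,
 * for a 2x2-limb multiply:
 *
 *     muladd_fast(a0, b0);               extract_fast(l0);
 *     muladd(a0, b1); muladd(a1, b0);    extract(l1);
 *     muladd_fast(a1, b1);               extract_fast(l2);
 *     l3 = c0;   // whatever is left in the accumulator
 * )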
*/ uint64_t c0 = 0, c1 = 0; uint32_t c2 = 0; /* l[0..7] = a[0..3] * a[0..3]. */ muladd_fast(a->d[0], a->d[0]); extract_fast(l[0]); muladd2(a->d[0], a->d[1]); extract(l[1]); muladd2(a->d[0], a->d[2]); muladd(a->d[1], a->d[1]); extract(l[2]); muladd2(a->d[0], a->d[3]); muladd2(a->d[1], a->d[2]); extract(l[3]); muladd2(a->d[1], a->d[3]); muladd(a->d[2], a->d[2]); extract(l[4]); muladd2(a->d[2], a->d[3]); extract(l[5]); muladd_fast(a->d[3], a->d[3]); extract_fast(l[6]); VERIFY_CHECK(c1 == 0); l[7] = c0; #endif } #undef sumadd #undef sumadd_fast #undef muladd #undef muladd_fast #undef muladd2 #undef extract #undef extract_fast static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b) { uint64_t l[8]; secp256k1_scalar_mul_512(l, a, b); secp256k1_scalar_reduce_512(r, l); } static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) { int ret; VERIFY_CHECK(n > 0); VERIFY_CHECK(n < 16); ret = r->d[0] & ((1 << n) - 1); r->d[0] = (r->d[0] >> n) + (r->d[1] << (64 - n)); r->d[1] = (r->d[1] >> n) + (r->d[2] << (64 - n)); r->d[2] = (r->d[2] >> n) + (r->d[3] << (64 - n)); r->d[3] = (r->d[3] >> n); return ret; } static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a) { uint64_t l[8]; secp256k1_scalar_sqr_512(l, a); secp256k1_scalar_reduce_512(r, l); } static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) { r1->d[0] = k->d[0]; r1->d[1] = k->d[1]; r1->d[2] = 0; r1->d[3] = 0; r2->d[0] = k->d[2]; r2->d[1] = k->d[3]; r2->d[2] = 0; r2->d[3] = 0; } SECP256K1_INLINE static int secp256k1_scalar_eq(const secp256k1_scalar *a, const secp256k1_scalar *b) { return ((a->d[0] ^ b->d[0]) | (a->d[1] ^ b->d[1]) | (a->d[2] ^ b->d[2]) | (a->d[3] ^ b->d[3])) == 0; } SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b, unsigned int shift) { uint64_t l[8]; unsigned int shiftlimbs; unsigned int shiftlow; unsigned int shifthigh; VERIFY_CHECK(shift >= 256); secp256k1_scalar_mul_512(l, a, b); shiftlimbs = shift >> 6; shiftlow = shift & 0x3F; shifthigh = 64 - shiftlow; r->d[0] = shift < 512 ? (l[0 + shiftlimbs] >> shiftlow | (shift < 448 && shiftlow ? (l[1 + shiftlimbs] << shifthigh) : 0)) : 0; r->d[1] = shift < 448 ? (l[1 + shiftlimbs] >> shiftlow | (shift < 384 && shiftlow ? (l[2 + shiftlimbs] << shifthigh) : 0)) : 0; r->d[2] = shift < 384 ? (l[2 + shiftlimbs] >> shiftlow | (shift < 320 && shiftlow ? (l[3 + shiftlimbs] << shifthigh) : 0)) : 0; r->d[3] = shift < 320 ? (l[3 + shiftlimbs] >> shiftlow) : 0; secp256k1_scalar_cadd_bit(r, 0, (l[(shift - 1) >> 6] >> ((shift - 1) & 0x3f)) & 1); } static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const secp256k1_scalar *a, int flag) { uint64_t mask0, mask1; VG_CHECK_VERIFY(r->d, sizeof(r->d)); mask0 = flag + ~((uint64_t)0); mask1 = ~mask0; r->d[0] = (r->d[0] & mask0) | (a->d[0] & mask1); r->d[1] = (r->d[1] & mask0) | (a->d[1] & mask1); r->d[2] = (r->d[2] & mask0) | (a->d[2] & mask1); r->d[3] = (r->d[3] & mask0) | (a->d[3] & mask1); } +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { + secp256k1_scalar *t; + int i; + /* First compute xN as x ^ (2^N - 1) for some values of N, + * and uM as x ^ M for some values of M. 
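 *
 * As with the field inverse, this is Fermat's little theorem again: the
 * result is x^(n - 2) for the group order n. The bit pattern of n - 2 is
 * irregular, so instead of a few long 1-blocks the chain below reuses
 * short windows (x2 = 11, x3 = 111, u5 = 101, u9 = 1001, u11 = 1011,
 * u13 = 1101, plus the longer x6/x8 runs). A sanity check a test might
 * perform on the result:
 *
 *     secp256k1_scalar inv, prod;
 *     secp256k1_scalar_inverse(&inv, &x);    // x assumed nonzero
 *     secp256k1_scalar_mul(&prod, &inv, &x);
 *     CHECK(secp256k1_scalar_is_one(&prod));
 *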
*/ + secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; + secp256k1_scalar u2, u5, u9, u11, u13; + + secp256k1_scalar_sqr(&u2, x); + secp256k1_scalar_mul(&x2, &u2, x); + secp256k1_scalar_mul(&u5, &u2, &x2); + secp256k1_scalar_mul(&x3, &u5, &u2); + secp256k1_scalar_mul(&u9, &x3, &u2); + secp256k1_scalar_mul(&u11, &u9, &u2); + secp256k1_scalar_mul(&u13, &u11, &u2); + + secp256k1_scalar_sqr(&x6, &u13); + secp256k1_scalar_sqr(&x6, &x6); + secp256k1_scalar_mul(&x6, &x6, &u11); + + secp256k1_scalar_sqr(&x8, &x6); + secp256k1_scalar_sqr(&x8, &x8); + secp256k1_scalar_mul(&x8, &x8, &x2); + + secp256k1_scalar_sqr(&x14, &x8); + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(&x14, &x14); + } + secp256k1_scalar_mul(&x14, &x14, &x6); + + secp256k1_scalar_sqr(&x28, &x14); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x28, &x28); + } + secp256k1_scalar_mul(&x28, &x28, &x14); + + secp256k1_scalar_sqr(&x56, &x28); + for (i = 0; i < 27; i++) { + secp256k1_scalar_sqr(&x56, &x56); + } + secp256k1_scalar_mul(&x56, &x56, &x28); + + secp256k1_scalar_sqr(&x112, &x56); + for (i = 0; i < 55; i++) { + secp256k1_scalar_sqr(&x112, &x112); + } + secp256k1_scalar_mul(&x112, &x112, &x56); + + secp256k1_scalar_sqr(&x126, &x112); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x126, &x126); + } + secp256k1_scalar_mul(&x126, &x126, &x14); + + /* Then accumulate the final result (t starts at x126). */ + t = &x126; + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 10; i++) { /* 0000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 9; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(t, t); + } + 
secp256k1_scalar_mul(t, t, &x2); /* 11 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 10; i++) { /* 000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, x); /* 1 */ + for (i = 0; i < 8; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(r, t, &x6); /* 111111 */ +} + +static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { +#if defined(USE_SCALAR_INV_BUILTIN) + secp256k1_scalar_inverse(r, x); +#elif defined(USE_SCALAR_INV_NUM) + unsigned char b[32]; + secp256k1_num n, m; + secp256k1_scalar t = *x; + secp256k1_scalar_get_b32(b, &t); + secp256k1_num_set_bin(&n, b, 32); + secp256k1_scalar_order_get_num(&m); + secp256k1_num_mod_inverse(&n, &n, &m); + secp256k1_num_get_bin(b, 32, &n); + secp256k1_scalar_set_b32(r, b, NULL); + /* Verify that the inverse was computed correctly, without GMP code. */ + secp256k1_scalar_mul(&t, &t, r); + CHECK(secp256k1_scalar_is_one(&t)); +#else +#error "Please select scalar inverse implementation" +#endif +} + +SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { + return !(a->d[0] & 1); +} + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/secp256k1/src/scalar_8x32_impl.h b/src/secp256k1/src/scalar_8x32_impl.h index bf98e01d7..53b8d4ec4 100644 --- a/src/secp256k1/src/scalar_8x32_impl.h +++ b/src/secp256k1/src/scalar_8x32_impl.h @@ -1,734 +1,913 @@ /*********************************************************************** * Copyright (c) 2014 Pieter Wuille * * Distributed under the MIT software license, see the accompanying * * file COPYING or https://www.opensource.org/licenses/mit-license.php.* ***********************************************************************/ #ifndef SECP256K1_SCALAR_REPR_IMPL_H #define SECP256K1_SCALAR_REPR_IMPL_H /* Limbs of the secp256k1 order. */ #define SECP256K1_N_0 ((uint32_t)0xD0364141UL) #define SECP256K1_N_1 ((uint32_t)0xBFD25E8CUL) #define SECP256K1_N_2 ((uint32_t)0xAF48A03BUL) #define SECP256K1_N_3 ((uint32_t)0xBAAEDCE6UL) #define SECP256K1_N_4 ((uint32_t)0xFFFFFFFEUL) #define SECP256K1_N_5 ((uint32_t)0xFFFFFFFFUL) #define SECP256K1_N_6 ((uint32_t)0xFFFFFFFFUL) #define SECP256K1_N_7 ((uint32_t)0xFFFFFFFFUL) /* Limbs of 2^256 minus the secp256k1 order. */ #define SECP256K1_N_C_0 (~SECP256K1_N_0 + 1) #define SECP256K1_N_C_1 (~SECP256K1_N_1) #define SECP256K1_N_C_2 (~SECP256K1_N_2) #define SECP256K1_N_C_3 (~SECP256K1_N_3) #define SECP256K1_N_C_4 (1) /* Limbs of half the secp256k1 order. 
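 *
 * n/2 here is floor(n / 2); these limbs are only consumed by
 * secp256k1_scalar_is_high below. The typical caller folds a scalar into
 * the low half of the range, e.g. when producing low-S signatures:
 *
 *     if (secp256k1_scalar_is_high(&s)) {
 *         secp256k1_scalar_negate(&s, &s);   // s and n - s swap roles
 *     }
 *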
*/ #define SECP256K1_N_H_0 ((uint32_t)0x681B20A0UL) #define SECP256K1_N_H_1 ((uint32_t)0xDFE92F46UL) #define SECP256K1_N_H_2 ((uint32_t)0x57A4501DUL) #define SECP256K1_N_H_3 ((uint32_t)0x5D576E73UL) #define SECP256K1_N_H_4 ((uint32_t)0xFFFFFFFFUL) #define SECP256K1_N_H_5 ((uint32_t)0xFFFFFFFFUL) #define SECP256K1_N_H_6 ((uint32_t)0xFFFFFFFFUL) #define SECP256K1_N_H_7 ((uint32_t)0x7FFFFFFFUL) SECP256K1_INLINE static void secp256k1_scalar_clear(secp256k1_scalar *r) { r->d[0] = 0; r->d[1] = 0; r->d[2] = 0; r->d[3] = 0; r->d[4] = 0; r->d[5] = 0; r->d[6] = 0; r->d[7] = 0; } SECP256K1_INLINE static void secp256k1_scalar_set_int(secp256k1_scalar *r, unsigned int v) { r->d[0] = v; r->d[1] = 0; r->d[2] = 0; r->d[3] = 0; r->d[4] = 0; r->d[5] = 0; r->d[6] = 0; r->d[7] = 0; } SECP256K1_INLINE static unsigned int secp256k1_scalar_get_bits(const secp256k1_scalar *a, unsigned int offset, unsigned int count) { VERIFY_CHECK((offset + count - 1) >> 5 == offset >> 5); return (a->d[offset >> 5] >> (offset & 0x1F)) & ((1 << count) - 1); } SECP256K1_INLINE static unsigned int secp256k1_scalar_get_bits_var(const secp256k1_scalar *a, unsigned int offset, unsigned int count) { VERIFY_CHECK(count < 32); VERIFY_CHECK(offset + count <= 256); if ((offset + count - 1) >> 5 == offset >> 5) { return secp256k1_scalar_get_bits(a, offset, count); } else { VERIFY_CHECK((offset >> 5) + 1 < 8); return ((a->d[offset >> 5] >> (offset & 0x1F)) | (a->d[(offset >> 5) + 1] << (32 - (offset & 0x1F)))) & ((((uint32_t)1) << count) - 1); } } SECP256K1_INLINE static int secp256k1_scalar_check_overflow(const secp256k1_scalar *a) { int yes = 0; int no = 0; no |= (a->d[7] < SECP256K1_N_7); /* No need for a > check. */ no |= (a->d[6] < SECP256K1_N_6); /* No need for a > check. */ no |= (a->d[5] < SECP256K1_N_5); /* No need for a > check. 
*/ no |= (a->d[4] < SECP256K1_N_4); yes |= (a->d[4] > SECP256K1_N_4) & ~no; no |= (a->d[3] < SECP256K1_N_3) & ~yes; yes |= (a->d[3] > SECP256K1_N_3) & ~no; no |= (a->d[2] < SECP256K1_N_2) & ~yes; yes |= (a->d[2] > SECP256K1_N_2) & ~no; no |= (a->d[1] < SECP256K1_N_1) & ~yes; yes |= (a->d[1] > SECP256K1_N_1) & ~no; yes |= (a->d[0] >= SECP256K1_N_0) & ~no; return yes; } SECP256K1_INLINE static int secp256k1_scalar_reduce(secp256k1_scalar *r, uint32_t overflow) { uint64_t t; VERIFY_CHECK(overflow <= 1); t = (uint64_t)r->d[0] + overflow * SECP256K1_N_C_0; r->d[0] = t & 0xFFFFFFFFUL; t >>= 32; t += (uint64_t)r->d[1] + overflow * SECP256K1_N_C_1; r->d[1] = t & 0xFFFFFFFFUL; t >>= 32; t += (uint64_t)r->d[2] + overflow * SECP256K1_N_C_2; r->d[2] = t & 0xFFFFFFFFUL; t >>= 32; t += (uint64_t)r->d[3] + overflow * SECP256K1_N_C_3; r->d[3] = t & 0xFFFFFFFFUL; t >>= 32; t += (uint64_t)r->d[4] + overflow * SECP256K1_N_C_4; r->d[4] = t & 0xFFFFFFFFUL; t >>= 32; t += (uint64_t)r->d[5]; r->d[5] = t & 0xFFFFFFFFUL; t >>= 32; t += (uint64_t)r->d[6]; r->d[6] = t & 0xFFFFFFFFUL; t >>= 32; t += (uint64_t)r->d[7]; r->d[7] = t & 0xFFFFFFFFUL; return overflow; } static int secp256k1_scalar_add(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b) { int overflow; uint64_t t = (uint64_t)a->d[0] + b->d[0]; r->d[0] = t & 0xFFFFFFFFULL; t >>= 32; t += (uint64_t)a->d[1] + b->d[1]; r->d[1] = t & 0xFFFFFFFFULL; t >>= 32; t += (uint64_t)a->d[2] + b->d[2]; r->d[2] = t & 0xFFFFFFFFULL; t >>= 32; t += (uint64_t)a->d[3] + b->d[3]; r->d[3] = t & 0xFFFFFFFFULL; t >>= 32; t += (uint64_t)a->d[4] + b->d[4]; r->d[4] = t & 0xFFFFFFFFULL; t >>= 32; t += (uint64_t)a->d[5] + b->d[5]; r->d[5] = t & 0xFFFFFFFFULL; t >>= 32; t += (uint64_t)a->d[6] + b->d[6]; r->d[6] = t & 0xFFFFFFFFULL; t >>= 32; t += (uint64_t)a->d[7] + b->d[7]; r->d[7] = t & 0xFFFFFFFFULL; t >>= 32; overflow = t + secp256k1_scalar_check_overflow(r); VERIFY_CHECK(overflow == 0 || overflow == 1); secp256k1_scalar_reduce(r, overflow); return overflow; } static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int flag) { uint64_t t; VERIFY_CHECK(bit < 256); bit += ((uint32_t) flag - 1) & 0x100; /* forcing (bit >> 5) > 7 makes this a noop */ t = (uint64_t)r->d[0] + (((uint32_t)((bit >> 5) == 0)) << (bit & 0x1F)); r->d[0] = t & 0xFFFFFFFFULL; t >>= 32; t += (uint64_t)r->d[1] + (((uint32_t)((bit >> 5) == 1)) << (bit & 0x1F)); r->d[1] = t & 0xFFFFFFFFULL; t >>= 32; t += (uint64_t)r->d[2] + (((uint32_t)((bit >> 5) == 2)) << (bit & 0x1F)); r->d[2] = t & 0xFFFFFFFFULL; t >>= 32; t += (uint64_t)r->d[3] + (((uint32_t)((bit >> 5) == 3)) << (bit & 0x1F)); r->d[3] = t & 0xFFFFFFFFULL; t >>= 32; t += (uint64_t)r->d[4] + (((uint32_t)((bit >> 5) == 4)) << (bit & 0x1F)); r->d[4] = t & 0xFFFFFFFFULL; t >>= 32; t += (uint64_t)r->d[5] + (((uint32_t)((bit >> 5) == 5)) << (bit & 0x1F)); r->d[5] = t & 0xFFFFFFFFULL; t >>= 32; t += (uint64_t)r->d[6] + (((uint32_t)((bit >> 5) == 6)) << (bit & 0x1F)); r->d[6] = t & 0xFFFFFFFFULL; t >>= 32; t += (uint64_t)r->d[7] + (((uint32_t)((bit >> 5) == 7)) << (bit & 0x1F)); r->d[7] = t & 0xFFFFFFFFULL; #ifdef VERIFY VERIFY_CHECK((t >> 32) == 0); VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0); #endif } static void secp256k1_scalar_set_b32(secp256k1_scalar *r, const unsigned char *b32, int *overflow) { int over; r->d[0] = (uint32_t)b32[31] | (uint32_t)b32[30] << 8 | (uint32_t)b32[29] << 16 | (uint32_t)b32[28] << 24; r->d[1] = (uint32_t)b32[27] | (uint32_t)b32[26] << 8 | (uint32_t)b32[25] << 16 | (uint32_t)b32[24] << 
24; r->d[2] = (uint32_t)b32[23] | (uint32_t)b32[22] << 8 | (uint32_t)b32[21] << 16 | (uint32_t)b32[20] << 24; r->d[3] = (uint32_t)b32[19] | (uint32_t)b32[18] << 8 | (uint32_t)b32[17] << 16 | (uint32_t)b32[16] << 24; r->d[4] = (uint32_t)b32[15] | (uint32_t)b32[14] << 8 | (uint32_t)b32[13] << 16 | (uint32_t)b32[12] << 24; r->d[5] = (uint32_t)b32[11] | (uint32_t)b32[10] << 8 | (uint32_t)b32[9] << 16 | (uint32_t)b32[8] << 24; r->d[6] = (uint32_t)b32[7] | (uint32_t)b32[6] << 8 | (uint32_t)b32[5] << 16 | (uint32_t)b32[4] << 24; r->d[7] = (uint32_t)b32[3] | (uint32_t)b32[2] << 8 | (uint32_t)b32[1] << 16 | (uint32_t)b32[0] << 24; over = secp256k1_scalar_reduce(r, secp256k1_scalar_check_overflow(r)); if (overflow) { *overflow = over; } } static void secp256k1_scalar_get_b32(unsigned char *bin, const secp256k1_scalar* a) { bin[0] = a->d[7] >> 24; bin[1] = a->d[7] >> 16; bin[2] = a->d[7] >> 8; bin[3] = a->d[7]; bin[4] = a->d[6] >> 24; bin[5] = a->d[6] >> 16; bin[6] = a->d[6] >> 8; bin[7] = a->d[6]; bin[8] = a->d[5] >> 24; bin[9] = a->d[5] >> 16; bin[10] = a->d[5] >> 8; bin[11] = a->d[5]; bin[12] = a->d[4] >> 24; bin[13] = a->d[4] >> 16; bin[14] = a->d[4] >> 8; bin[15] = a->d[4]; bin[16] = a->d[3] >> 24; bin[17] = a->d[3] >> 16; bin[18] = a->d[3] >> 8; bin[19] = a->d[3]; bin[20] = a->d[2] >> 24; bin[21] = a->d[2] >> 16; bin[22] = a->d[2] >> 8; bin[23] = a->d[2]; bin[24] = a->d[1] >> 24; bin[25] = a->d[1] >> 16; bin[26] = a->d[1] >> 8; bin[27] = a->d[1]; bin[28] = a->d[0] >> 24; bin[29] = a->d[0] >> 16; bin[30] = a->d[0] >> 8; bin[31] = a->d[0]; } SECP256K1_INLINE static int secp256k1_scalar_is_zero(const secp256k1_scalar *a) { return (a->d[0] | a->d[1] | a->d[2] | a->d[3] | a->d[4] | a->d[5] | a->d[6] | a->d[7]) == 0; } static void secp256k1_scalar_negate(secp256k1_scalar *r, const secp256k1_scalar *a) { uint32_t nonzero = 0xFFFFFFFFUL * (secp256k1_scalar_is_zero(a) == 0); uint64_t t = (uint64_t)(~a->d[0]) + SECP256K1_N_0 + 1; r->d[0] = t & nonzero; t >>= 32; t += (uint64_t)(~a->d[1]) + SECP256K1_N_1; r->d[1] = t & nonzero; t >>= 32; t += (uint64_t)(~a->d[2]) + SECP256K1_N_2; r->d[2] = t & nonzero; t >>= 32; t += (uint64_t)(~a->d[3]) + SECP256K1_N_3; r->d[3] = t & nonzero; t >>= 32; t += (uint64_t)(~a->d[4]) + SECP256K1_N_4; r->d[4] = t & nonzero; t >>= 32; t += (uint64_t)(~a->d[5]) + SECP256K1_N_5; r->d[5] = t & nonzero; t >>= 32; t += (uint64_t)(~a->d[6]) + SECP256K1_N_6; r->d[6] = t & nonzero; t >>= 32; t += (uint64_t)(~a->d[7]) + SECP256K1_N_7; r->d[7] = t & nonzero; } SECP256K1_INLINE static int secp256k1_scalar_is_one(const secp256k1_scalar *a) { return ((a->d[0] ^ 1) | a->d[1] | a->d[2] | a->d[3] | a->d[4] | a->d[5] | a->d[6] | a->d[7]) == 0; } static int secp256k1_scalar_is_high(const secp256k1_scalar *a) { int yes = 0; int no = 0; no |= (a->d[7] < SECP256K1_N_H_7); yes |= (a->d[7] > SECP256K1_N_H_7) & ~no; no |= (a->d[6] < SECP256K1_N_H_6) & ~yes; /* No need for a > check. */ no |= (a->d[5] < SECP256K1_N_H_5) & ~yes; /* No need for a > check. */ no |= (a->d[4] < SECP256K1_N_H_4) & ~yes; /* No need for a > check. 
*/ no |= (a->d[3] < SECP256K1_N_H_3) & ~yes; yes |= (a->d[3] > SECP256K1_N_H_3) & ~no; no |= (a->d[2] < SECP256K1_N_H_2) & ~yes; yes |= (a->d[2] > SECP256K1_N_H_2) & ~no; no |= (a->d[1] < SECP256K1_N_H_1) & ~yes; yes |= (a->d[1] > SECP256K1_N_H_1) & ~no; yes |= (a->d[0] > SECP256K1_N_H_0) & ~no; return yes; } static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) { /* If we are flag = 0, mask = 00...00 and this is a no-op; * if we are flag = 1, mask = 11...11 and this is identical to secp256k1_scalar_negate */ uint32_t mask = !flag - 1; uint32_t nonzero = 0xFFFFFFFFUL * (secp256k1_scalar_is_zero(r) == 0); uint64_t t = (uint64_t)(r->d[0] ^ mask) + ((SECP256K1_N_0 + 1) & mask); r->d[0] = t & nonzero; t >>= 32; t += (uint64_t)(r->d[1] ^ mask) + (SECP256K1_N_1 & mask); r->d[1] = t & nonzero; t >>= 32; t += (uint64_t)(r->d[2] ^ mask) + (SECP256K1_N_2 & mask); r->d[2] = t & nonzero; t >>= 32; t += (uint64_t)(r->d[3] ^ mask) + (SECP256K1_N_3 & mask); r->d[3] = t & nonzero; t >>= 32; t += (uint64_t)(r->d[4] ^ mask) + (SECP256K1_N_4 & mask); r->d[4] = t & nonzero; t >>= 32; t += (uint64_t)(r->d[5] ^ mask) + (SECP256K1_N_5 & mask); r->d[5] = t & nonzero; t >>= 32; t += (uint64_t)(r->d[6] ^ mask) + (SECP256K1_N_6 & mask); r->d[6] = t & nonzero; t >>= 32; t += (uint64_t)(r->d[7] ^ mask) + (SECP256K1_N_7 & mask); r->d[7] = t & nonzero; return 2 * (mask == 0) - 1; } /* Inspired by the macros in OpenSSL's crypto/bn/asm/x86_64-gcc.c. */ /** Add a*b to the number defined by (c0,c1,c2). c2 must never overflow. */ #define muladd(a,b) { \ uint32_t tl, th; \ { \ uint64_t t = (uint64_t)a * b; \ th = t >> 32; /* at most 0xFFFFFFFE */ \ tl = t; \ } \ c0 += tl; /* overflow is handled on the next line */ \ th += (c0 < tl); /* at most 0xFFFFFFFF */ \ c1 += th; /* overflow is handled on the next line */ \ c2 += (c1 < th); /* never overflows by contract (verified in the next line) */ \ VERIFY_CHECK((c1 >= th) || (c2 != 0)); \ } /** Add a*b to the number defined by (c0,c1). c1 must never overflow. */ #define muladd_fast(a,b) { \ uint32_t tl, th; \ { \ uint64_t t = (uint64_t)a * b; \ th = t >> 32; /* at most 0xFFFFFFFE */ \ tl = t; \ } \ c0 += tl; /* overflow is handled on the next line */ \ th += (c0 < tl); /* at most 0xFFFFFFFF */ \ c1 += th; /* never overflows by contract (verified in the next line) */ \ VERIFY_CHECK(c1 >= th); \ } /** Add 2*a*b to the number defined by (c0,c1,c2). c2 must never overflow. */ #define muladd2(a,b) { \ uint32_t tl, th, th2, tl2; \ { \ uint64_t t = (uint64_t)a * b; \ th = t >> 32; /* at most 0xFFFFFFFE */ \ tl = t; \ } \ th2 = th + th; /* at most 0xFFFFFFFE (in case th was 0x7FFFFFFF) */ \ c2 += (th2 < th); /* never overflows by contract (verified the next line) */ \ VERIFY_CHECK((th2 >= th) || (c2 != 0)); \ tl2 = tl + tl; /* at most 0xFFFFFFFE (in case the lowest 63 bits of tl were 0x7FFFFFFF) */ \ th2 += (tl2 < tl); /* at most 0xFFFFFFFF */ \ c0 += tl2; /* overflow is handled on the next line */ \ th2 += (c0 < tl2); /* second overflow is handled on the next line */ \ c2 += (c0 < tl2) & (th2 == 0); /* never overflows by contract (verified the next line) */ \ VERIFY_CHECK((c0 >= tl2) || (th2 != 0) || (c2 != 0)); \ c1 += th2; /* overflow is handled on the next line */ \ c2 += (c1 < th2); /* never overflows by contract (verified the next line) */ \ VERIFY_CHECK((c1 >= th2) || (c2 != 0)); \ } /** Add a to the number defined by (c0,c1,c2). c2 must never overflow. 
*/ #define sumadd(a) { \ unsigned int over; \ c0 += (a); /* overflow is handled on the next line */ \ over = (c0 < (a)); \ c1 += over; /* overflow is handled on the next line */ \ c2 += (c1 < over); /* never overflows by contract */ \ } /** Add a to the number defined by (c0,c1). c1 must never overflow, c2 must be zero. */ #define sumadd_fast(a) { \ c0 += (a); /* overflow is handled on the next line */ \ c1 += (c0 < (a)); /* never overflows by contract (verified the next line) */ \ VERIFY_CHECK((c1 != 0) | (c0 >= (a))); \ VERIFY_CHECK(c2 == 0); \ } /** Extract the lowest 32 bits of (c0,c1,c2) into n, and left shift the number 32 bits. */ #define extract(n) { \ (n) = c0; \ c0 = c1; \ c1 = c2; \ c2 = 0; \ } /** Extract the lowest 32 bits of (c0,c1,c2) into n, and left shift the number 32 bits. c2 is required to be zero. */ #define extract_fast(n) { \ (n) = c0; \ c0 = c1; \ c1 = 0; \ VERIFY_CHECK(c2 == 0); \ } static void secp256k1_scalar_reduce_512(secp256k1_scalar *r, const uint32_t *l) { uint64_t c; uint32_t n0 = l[8], n1 = l[9], n2 = l[10], n3 = l[11], n4 = l[12], n5 = l[13], n6 = l[14], n7 = l[15]; uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12; uint32_t p0, p1, p2, p3, p4, p5, p6, p7, p8; /* 96 bit accumulator. */ uint32_t c0, c1, c2; /* Reduce 512 bits into 385. */ /* m[0..12] = l[0..7] + n[0..7] * SECP256K1_N_C. */ c0 = l[0]; c1 = 0; c2 = 0; muladd_fast(n0, SECP256K1_N_C_0); extract_fast(m0); sumadd_fast(l[1]); muladd(n1, SECP256K1_N_C_0); muladd(n0, SECP256K1_N_C_1); extract(m1); sumadd(l[2]); muladd(n2, SECP256K1_N_C_0); muladd(n1, SECP256K1_N_C_1); muladd(n0, SECP256K1_N_C_2); extract(m2); sumadd(l[3]); muladd(n3, SECP256K1_N_C_0); muladd(n2, SECP256K1_N_C_1); muladd(n1, SECP256K1_N_C_2); muladd(n0, SECP256K1_N_C_3); extract(m3); sumadd(l[4]); muladd(n4, SECP256K1_N_C_0); muladd(n3, SECP256K1_N_C_1); muladd(n2, SECP256K1_N_C_2); muladd(n1, SECP256K1_N_C_3); sumadd(n0); extract(m4); sumadd(l[5]); muladd(n5, SECP256K1_N_C_0); muladd(n4, SECP256K1_N_C_1); muladd(n3, SECP256K1_N_C_2); muladd(n2, SECP256K1_N_C_3); sumadd(n1); extract(m5); sumadd(l[6]); muladd(n6, SECP256K1_N_C_0); muladd(n5, SECP256K1_N_C_1); muladd(n4, SECP256K1_N_C_2); muladd(n3, SECP256K1_N_C_3); sumadd(n2); extract(m6); sumadd(l[7]); muladd(n7, SECP256K1_N_C_0); muladd(n6, SECP256K1_N_C_1); muladd(n5, SECP256K1_N_C_2); muladd(n4, SECP256K1_N_C_3); sumadd(n3); extract(m7); muladd(n7, SECP256K1_N_C_1); muladd(n6, SECP256K1_N_C_2); muladd(n5, SECP256K1_N_C_3); sumadd(n4); extract(m8); muladd(n7, SECP256K1_N_C_2); muladd(n6, SECP256K1_N_C_3); sumadd(n5); extract(m9); muladd(n7, SECP256K1_N_C_3); sumadd(n6); extract(m10); sumadd_fast(n7); extract_fast(m11); VERIFY_CHECK(c0 <= 1); m12 = c0; /* Reduce 385 bits into 258. */ /* p[0..8] = m[0..7] + m[8..12] * SECP256K1_N_C. 
*/ c0 = m0; c1 = 0; c2 = 0; muladd_fast(m8, SECP256K1_N_C_0); extract_fast(p0); sumadd_fast(m1); muladd(m9, SECP256K1_N_C_0); muladd(m8, SECP256K1_N_C_1); extract(p1); sumadd(m2); muladd(m10, SECP256K1_N_C_0); muladd(m9, SECP256K1_N_C_1); muladd(m8, SECP256K1_N_C_2); extract(p2); sumadd(m3); muladd(m11, SECP256K1_N_C_0); muladd(m10, SECP256K1_N_C_1); muladd(m9, SECP256K1_N_C_2); muladd(m8, SECP256K1_N_C_3); extract(p3); sumadd(m4); muladd(m12, SECP256K1_N_C_0); muladd(m11, SECP256K1_N_C_1); muladd(m10, SECP256K1_N_C_2); muladd(m9, SECP256K1_N_C_3); sumadd(m8); extract(p4); sumadd(m5); muladd(m12, SECP256K1_N_C_1); muladd(m11, SECP256K1_N_C_2); muladd(m10, SECP256K1_N_C_3); sumadd(m9); extract(p5); sumadd(m6); muladd(m12, SECP256K1_N_C_2); muladd(m11, SECP256K1_N_C_3); sumadd(m10); extract(p6); sumadd_fast(m7); muladd_fast(m12, SECP256K1_N_C_3); sumadd_fast(m11); extract_fast(p7); p8 = c0 + m12; VERIFY_CHECK(p8 <= 2); /* Reduce 258 bits into 256. */ /* r[0..7] = p[0..7] + p[8] * SECP256K1_N_C. */ c = p0 + (uint64_t)SECP256K1_N_C_0 * p8; r->d[0] = c & 0xFFFFFFFFUL; c >>= 32; c += p1 + (uint64_t)SECP256K1_N_C_1 * p8; r->d[1] = c & 0xFFFFFFFFUL; c >>= 32; c += p2 + (uint64_t)SECP256K1_N_C_2 * p8; r->d[2] = c & 0xFFFFFFFFUL; c >>= 32; c += p3 + (uint64_t)SECP256K1_N_C_3 * p8; r->d[3] = c & 0xFFFFFFFFUL; c >>= 32; c += p4 + (uint64_t)p8; r->d[4] = c & 0xFFFFFFFFUL; c >>= 32; c += p5; r->d[5] = c & 0xFFFFFFFFUL; c >>= 32; c += p6; r->d[6] = c & 0xFFFFFFFFUL; c >>= 32; c += p7; r->d[7] = c & 0xFFFFFFFFUL; c >>= 32; /* Final reduction of r. */ secp256k1_scalar_reduce(r, c + secp256k1_scalar_check_overflow(r)); } static void secp256k1_scalar_mul_512(uint32_t *l, const secp256k1_scalar *a, const secp256k1_scalar *b) { /* 96 bit accumulator. */ uint32_t c0 = 0, c1 = 0, c2 = 0; /* l[0..15] = a[0..7] * b[0..7]. 
*/ muladd_fast(a->d[0], b->d[0]); extract_fast(l[0]); muladd(a->d[0], b->d[1]); muladd(a->d[1], b->d[0]); extract(l[1]); muladd(a->d[0], b->d[2]); muladd(a->d[1], b->d[1]); muladd(a->d[2], b->d[0]); extract(l[2]); muladd(a->d[0], b->d[3]); muladd(a->d[1], b->d[2]); muladd(a->d[2], b->d[1]); muladd(a->d[3], b->d[0]); extract(l[3]); muladd(a->d[0], b->d[4]); muladd(a->d[1], b->d[3]); muladd(a->d[2], b->d[2]); muladd(a->d[3], b->d[1]); muladd(a->d[4], b->d[0]); extract(l[4]); muladd(a->d[0], b->d[5]); muladd(a->d[1], b->d[4]); muladd(a->d[2], b->d[3]); muladd(a->d[3], b->d[2]); muladd(a->d[4], b->d[1]); muladd(a->d[5], b->d[0]); extract(l[5]); muladd(a->d[0], b->d[6]); muladd(a->d[1], b->d[5]); muladd(a->d[2], b->d[4]); muladd(a->d[3], b->d[3]); muladd(a->d[4], b->d[2]); muladd(a->d[5], b->d[1]); muladd(a->d[6], b->d[0]); extract(l[6]); muladd(a->d[0], b->d[7]); muladd(a->d[1], b->d[6]); muladd(a->d[2], b->d[5]); muladd(a->d[3], b->d[4]); muladd(a->d[4], b->d[3]); muladd(a->d[5], b->d[2]); muladd(a->d[6], b->d[1]); muladd(a->d[7], b->d[0]); extract(l[7]); muladd(a->d[1], b->d[7]); muladd(a->d[2], b->d[6]); muladd(a->d[3], b->d[5]); muladd(a->d[4], b->d[4]); muladd(a->d[5], b->d[3]); muladd(a->d[6], b->d[2]); muladd(a->d[7], b->d[1]); extract(l[8]); muladd(a->d[2], b->d[7]); muladd(a->d[3], b->d[6]); muladd(a->d[4], b->d[5]); muladd(a->d[5], b->d[4]); muladd(a->d[6], b->d[3]); muladd(a->d[7], b->d[2]); extract(l[9]); muladd(a->d[3], b->d[7]); muladd(a->d[4], b->d[6]); muladd(a->d[5], b->d[5]); muladd(a->d[6], b->d[4]); muladd(a->d[7], b->d[3]); extract(l[10]); muladd(a->d[4], b->d[7]); muladd(a->d[5], b->d[6]); muladd(a->d[6], b->d[5]); muladd(a->d[7], b->d[4]); extract(l[11]); muladd(a->d[5], b->d[7]); muladd(a->d[6], b->d[6]); muladd(a->d[7], b->d[5]); extract(l[12]); muladd(a->d[6], b->d[7]); muladd(a->d[7], b->d[6]); extract(l[13]); muladd_fast(a->d[7], b->d[7]); extract_fast(l[14]); VERIFY_CHECK(c1 == 0); l[15] = c0; } static void secp256k1_scalar_sqr_512(uint32_t *l, const secp256k1_scalar *a) { /* 96 bit accumulator. */ uint32_t c0 = 0, c1 = 0, c2 = 0; /* l[0..15] = a[0..7]^2. 
*/ muladd_fast(a->d[0], a->d[0]); extract_fast(l[0]); muladd2(a->d[0], a->d[1]); extract(l[1]); muladd2(a->d[0], a->d[2]); muladd(a->d[1], a->d[1]); extract(l[2]); muladd2(a->d[0], a->d[3]); muladd2(a->d[1], a->d[2]); extract(l[3]); muladd2(a->d[0], a->d[4]); muladd2(a->d[1], a->d[3]); muladd(a->d[2], a->d[2]); extract(l[4]); muladd2(a->d[0], a->d[5]); muladd2(a->d[1], a->d[4]); muladd2(a->d[2], a->d[3]); extract(l[5]); muladd2(a->d[0], a->d[6]); muladd2(a->d[1], a->d[5]); muladd2(a->d[2], a->d[4]); muladd(a->d[3], a->d[3]); extract(l[6]); muladd2(a->d[0], a->d[7]); muladd2(a->d[1], a->d[6]); muladd2(a->d[2], a->d[5]); muladd2(a->d[3], a->d[4]); extract(l[7]); muladd2(a->d[1], a->d[7]); muladd2(a->d[2], a->d[6]); muladd2(a->d[3], a->d[5]); muladd(a->d[4], a->d[4]); extract(l[8]); muladd2(a->d[2], a->d[7]); muladd2(a->d[3], a->d[6]); muladd2(a->d[4], a->d[5]); extract(l[9]); muladd2(a->d[3], a->d[7]); muladd2(a->d[4], a->d[6]); muladd(a->d[5], a->d[5]); extract(l[10]); muladd2(a->d[4], a->d[7]); muladd2(a->d[5], a->d[6]); extract(l[11]); muladd2(a->d[5], a->d[7]); muladd(a->d[6], a->d[6]); extract(l[12]); muladd2(a->d[6], a->d[7]); extract(l[13]); muladd_fast(a->d[7], a->d[7]); extract_fast(l[14]); VERIFY_CHECK(c1 == 0); l[15] = c0; } #undef sumadd #undef sumadd_fast #undef muladd #undef muladd_fast #undef muladd2 #undef extract #undef extract_fast static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b) { uint32_t l[16]; secp256k1_scalar_mul_512(l, a, b); secp256k1_scalar_reduce_512(r, l); } static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) { int ret; VERIFY_CHECK(n > 0); VERIFY_CHECK(n < 16); ret = r->d[0] & ((1 << n) - 1); r->d[0] = (r->d[0] >> n) + (r->d[1] << (32 - n)); r->d[1] = (r->d[1] >> n) + (r->d[2] << (32 - n)); r->d[2] = (r->d[2] >> n) + (r->d[3] << (32 - n)); r->d[3] = (r->d[3] >> n) + (r->d[4] << (32 - n)); r->d[4] = (r->d[4] >> n) + (r->d[5] << (32 - n)); r->d[5] = (r->d[5] >> n) + (r->d[6] << (32 - n)); r->d[6] = (r->d[6] >> n) + (r->d[7] << (32 - n)); r->d[7] = (r->d[7] >> n); return ret; } static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a) { uint32_t l[16]; secp256k1_scalar_sqr_512(l, a); secp256k1_scalar_reduce_512(r, l); } static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) { r1->d[0] = k->d[0]; r1->d[1] = k->d[1]; r1->d[2] = k->d[2]; r1->d[3] = k->d[3]; r1->d[4] = 0; r1->d[5] = 0; r1->d[6] = 0; r1->d[7] = 0; r2->d[0] = k->d[4]; r2->d[1] = k->d[5]; r2->d[2] = k->d[6]; r2->d[3] = k->d[7]; r2->d[4] = 0; r2->d[5] = 0; r2->d[6] = 0; r2->d[7] = 0; } SECP256K1_INLINE static int secp256k1_scalar_eq(const secp256k1_scalar *a, const secp256k1_scalar *b) { return ((a->d[0] ^ b->d[0]) | (a->d[1] ^ b->d[1]) | (a->d[2] ^ b->d[2]) | (a->d[3] ^ b->d[3]) | (a->d[4] ^ b->d[4]) | (a->d[5] ^ b->d[5]) | (a->d[6] ^ b->d[6]) | (a->d[7] ^ b->d[7])) == 0; } SECP256K1_INLINE static void secp256k1_scalar_mul_shift_var(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b, unsigned int shift) { uint32_t l[16]; unsigned int shiftlimbs; unsigned int shiftlow; unsigned int shifthigh; VERIFY_CHECK(shift >= 256); secp256k1_scalar_mul_512(l, a, b); shiftlimbs = shift >> 5; shiftlow = shift & 0x1F; shifthigh = 32 - shiftlow; r->d[0] = shift < 512 ? (l[0 + shiftlimbs] >> shiftlow | (shift < 480 && shiftlow ? (l[1 + shiftlimbs] << shifthigh) : 0)) : 0; r->d[1] = shift < 480 ? (l[1 + shiftlimbs] >> shiftlow | (shift < 448 && shiftlow ? 
(l[2 + shiftlimbs] << shifthigh) : 0)) : 0; r->d[2] = shift < 448 ? (l[2 + shiftlimbs] >> shiftlow | (shift < 416 && shiftlow ? (l[3 + shiftlimbs] << shifthigh) : 0)) : 0; r->d[3] = shift < 416 ? (l[3 + shiftlimbs] >> shiftlow | (shift < 384 && shiftlow ? (l[4 + shiftlimbs] << shifthigh) : 0)) : 0; r->d[4] = shift < 384 ? (l[4 + shiftlimbs] >> shiftlow | (shift < 352 && shiftlow ? (l[5 + shiftlimbs] << shifthigh) : 0)) : 0; r->d[5] = shift < 352 ? (l[5 + shiftlimbs] >> shiftlow | (shift < 320 && shiftlow ? (l[6 + shiftlimbs] << shifthigh) : 0)) : 0; r->d[6] = shift < 320 ? (l[6 + shiftlimbs] >> shiftlow | (shift < 288 && shiftlow ? (l[7 + shiftlimbs] << shifthigh) : 0)) : 0; r->d[7] = shift < 288 ? (l[7 + shiftlimbs] >> shiftlow) : 0; secp256k1_scalar_cadd_bit(r, 0, (l[(shift - 1) >> 5] >> ((shift - 1) & 0x1f)) & 1); } static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const secp256k1_scalar *a, int flag) { uint32_t mask0, mask1; VG_CHECK_VERIFY(r->d, sizeof(r->d)); mask0 = flag + ~((uint32_t)0); mask1 = ~mask0; r->d[0] = (r->d[0] & mask0) | (a->d[0] & mask1); r->d[1] = (r->d[1] & mask0) | (a->d[1] & mask1); r->d[2] = (r->d[2] & mask0) | (a->d[2] & mask1); r->d[3] = (r->d[3] & mask0) | (a->d[3] & mask1); r->d[4] = (r->d[4] & mask0) | (a->d[4] & mask1); r->d[5] = (r->d[5] & mask0) | (a->d[5] & mask1); r->d[6] = (r->d[6] & mask0) | (a->d[6] & mask1); r->d[7] = (r->d[7] & mask0) | (a->d[7] & mask1); } +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { + secp256k1_scalar *t; + int i; + /* First compute xN as x ^ (2^N - 1) for some values of N, + * and uM as x ^ M for some values of M. */ + secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; + secp256k1_scalar u2, u5, u9, u11, u13; + + secp256k1_scalar_sqr(&u2, x); + secp256k1_scalar_mul(&x2, &u2, x); + secp256k1_scalar_mul(&u5, &u2, &x2); + secp256k1_scalar_mul(&x3, &u5, &u2); + secp256k1_scalar_mul(&u9, &x3, &u2); + secp256k1_scalar_mul(&u11, &u9, &u2); + secp256k1_scalar_mul(&u13, &u11, &u2); + + secp256k1_scalar_sqr(&x6, &u13); + secp256k1_scalar_sqr(&x6, &x6); + secp256k1_scalar_mul(&x6, &x6, &u11); + + secp256k1_scalar_sqr(&x8, &x6); + secp256k1_scalar_sqr(&x8, &x8); + secp256k1_scalar_mul(&x8, &x8, &x2); + + secp256k1_scalar_sqr(&x14, &x8); + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(&x14, &x14); + } + secp256k1_scalar_mul(&x14, &x14, &x6); + + secp256k1_scalar_sqr(&x28, &x14); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x28, &x28); + } + secp256k1_scalar_mul(&x28, &x28, &x14); + + secp256k1_scalar_sqr(&x56, &x28); + for (i = 0; i < 27; i++) { + secp256k1_scalar_sqr(&x56, &x56); + } + secp256k1_scalar_mul(&x56, &x56, &x28); + + secp256k1_scalar_sqr(&x112, &x56); + for (i = 0; i < 55; i++) { + secp256k1_scalar_sqr(&x112, &x112); + } + secp256k1_scalar_mul(&x112, &x112, &x56); + + secp256k1_scalar_sqr(&x126, &x112); + for (i = 0; i < 13; i++) { + secp256k1_scalar_sqr(&x126, &x126); + } + secp256k1_scalar_mul(&x126, &x126, &x14); + + /* Then accumulate the final result (t starts at x126). 
*/ + t = &x126; + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 3; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u5); /* 101 */ + for (i = 0; i < 10; i++) { /* 0000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 4; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x3); /* 111 */ + for (i = 0; i < 9; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ + for (i = 0; i < 5; i++) { /* 0 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u11); /* 1011 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 5; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &x2); /* 11 */ + for (i = 0; i < 6; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 10; i++) { /* 000000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u13); /* 1101 */ + for (i = 0; i < 4; i++) { + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, &u9); /* 1001 */ + for (i = 0; i < 6; i++) { /* 00000 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(t, t, x); /* 1 */ + for (i = 0; i < 8; i++) { /* 00 */ + secp256k1_scalar_sqr(t, t); + } + secp256k1_scalar_mul(r, t, &x6); /* 111111 */ +} + +static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { +#if defined(USE_SCALAR_INV_BUILTIN) + secp256k1_scalar_inverse(r, x); +#elif defined(USE_SCALAR_INV_NUM) + unsigned char b[32]; + secp256k1_num n, m; + secp256k1_scalar t = *x; + secp256k1_scalar_get_b32(b, &t); + secp256k1_num_set_bin(&n, b, 32); + secp256k1_scalar_order_get_num(&m); + secp256k1_num_mod_inverse(&n, &n, &m); + secp256k1_num_get_bin(b, 32, &n); + secp256k1_scalar_set_b32(r, b, NULL); + /* Verify that the inverse was computed correctly, without GMP code. 
*/ + secp256k1_scalar_mul(&t, &t, r); + CHECK(secp256k1_scalar_is_one(&t)); +#else +#error "Please select scalar inverse implementation" +#endif +} + +SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { + return !(a->d[0] & 1); +} + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */ diff --git a/src/secp256k1/src/scalar_impl.h b/src/secp256k1/src/scalar_impl.h index 61c1fbd58..b328afdb9 100644 --- a/src/secp256k1/src/scalar_impl.h +++ b/src/secp256k1/src/scalar_impl.h @@ -1,516 +1,325 @@ /*********************************************************************** * Copyright (c) 2014 Pieter Wuille * * Distributed under the MIT software license, see the accompanying * * file COPYING or https://www.opensource.org/licenses/mit-license.php.* ***********************************************************************/ #ifndef SECP256K1_SCALAR_IMPL_H #define SECP256K1_SCALAR_IMPL_H #ifdef VERIFY #include <string.h> #endif #include "scalar.h" #include "util.h" #if defined HAVE_CONFIG_H #include "libsecp256k1-config.h" #endif #if defined(EXHAUSTIVE_TEST_ORDER) #include "scalar_low_impl.h" #elif defined(SECP256K1_WIDEMUL_INT128) #include "scalar_4x64_impl.h" #elif defined(SECP256K1_WIDEMUL_INT64) #include "scalar_8x32_impl.h" #else #error "Please select wide multiplication implementation" #endif static const secp256k1_scalar secp256k1_scalar_one = SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 1); static const secp256k1_scalar secp256k1_scalar_zero = SECP256K1_SCALAR_CONST(0, 0, 0, 0, 0, 0, 0, 0); #ifndef USE_NUM_NONE static void secp256k1_scalar_get_num(secp256k1_num *r, const secp256k1_scalar *a) { unsigned char c[32]; secp256k1_scalar_get_b32(c, a); secp256k1_num_set_bin(r, c, 32); } /** secp256k1 curve order, see secp256k1_ecdsa_const_order_as_fe in ecdsa_impl.h */ static void secp256k1_scalar_order_get_num(secp256k1_num *r) { #if defined(EXHAUSTIVE_TEST_ORDER) static const unsigned char order[32] = { 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,EXHAUSTIVE_TEST_ORDER }; #else static const unsigned char order[32] = { 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFE, 0xBA,0xAE,0xDC,0xE6,0xAF,0x48,0xA0,0x3B, 0xBF,0xD2,0x5E,0x8C,0xD0,0x36,0x41,0x41 }; #endif secp256k1_num_set_bin(r, order, 32); } #endif static int secp256k1_scalar_set_b32_seckey(secp256k1_scalar *r, const unsigned char *bin) { int overflow; secp256k1_scalar_set_b32(r, bin, &overflow); return (!overflow) & (!secp256k1_scalar_is_zero(r)); } -static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { -#if defined(EXHAUSTIVE_TEST_ORDER) - int i; - *r = 0; - for (i = 0; i < EXHAUSTIVE_TEST_ORDER; i++) - if ((i * *x) % EXHAUSTIVE_TEST_ORDER == 1) - *r = i; - /* If this VERIFY_CHECK triggers we were given a noninvertible scalar (and thus - * have a composite group order; fix it in exhaustive_tests.c). */ - VERIFY_CHECK(*r != 0); -} -#else - secp256k1_scalar *t; - int i; - /* First compute xN as x ^ (2^N - 1) for some values of N, - * and uM as x ^ M for some values of M. 
*/ - secp256k1_scalar x2, x3, x6, x8, x14, x28, x56, x112, x126; - secp256k1_scalar u2, u5, u9, u11, u13; - - secp256k1_scalar_sqr(&u2, x); - secp256k1_scalar_mul(&x2, &u2, x); - secp256k1_scalar_mul(&u5, &u2, &x2); - secp256k1_scalar_mul(&x3, &u5, &u2); - secp256k1_scalar_mul(&u9, &x3, &u2); - secp256k1_scalar_mul(&u11, &u9, &u2); - secp256k1_scalar_mul(&u13, &u11, &u2); - - secp256k1_scalar_sqr(&x6, &u13); - secp256k1_scalar_sqr(&x6, &x6); - secp256k1_scalar_mul(&x6, &x6, &u11); - - secp256k1_scalar_sqr(&x8, &x6); - secp256k1_scalar_sqr(&x8, &x8); - secp256k1_scalar_mul(&x8, &x8, &x2); - - secp256k1_scalar_sqr(&x14, &x8); - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(&x14, &x14); - } - secp256k1_scalar_mul(&x14, &x14, &x6); - - secp256k1_scalar_sqr(&x28, &x14); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x28, &x28); - } - secp256k1_scalar_mul(&x28, &x28, &x14); - - secp256k1_scalar_sqr(&x56, &x28); - for (i = 0; i < 27; i++) { - secp256k1_scalar_sqr(&x56, &x56); - } - secp256k1_scalar_mul(&x56, &x56, &x28); - - secp256k1_scalar_sqr(&x112, &x56); - for (i = 0; i < 55; i++) { - secp256k1_scalar_sqr(&x112, &x112); - } - secp256k1_scalar_mul(&x112, &x112, &x56); - - secp256k1_scalar_sqr(&x126, &x112); - for (i = 0; i < 13; i++) { - secp256k1_scalar_sqr(&x126, &x126); - } - secp256k1_scalar_mul(&x126, &x126, &x14); - - /* Then accumulate the final result (t starts at x126). */ - t = &x126; - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 3; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u5); /* 101 */ - for (i = 0; i < 10; i++) { /* 0000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 4; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x3); /* 111 */ - for (i = 0; i < 9; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &x8); /* 11111111 */ - for (i = 0; i < 5; i++) { /* 0 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u11); /* 1011 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 5; i++) { - secp256k1_scalar_sqr(t, t); - } - 
secp256k1_scalar_mul(t, t, &x2); /* 11 */ - for (i = 0; i < 6; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 10; i++) { /* 000000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u13); /* 1101 */ - for (i = 0; i < 4; i++) { - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, &u9); /* 1001 */ - for (i = 0; i < 6; i++) { /* 00000 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(t, t, x); /* 1 */ - for (i = 0; i < 8; i++) { /* 00 */ - secp256k1_scalar_sqr(t, t); - } - secp256k1_scalar_mul(r, t, &x6); /* 111111 */ -} - -SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { - return !(a->d[0] & 1); -} -#endif - -static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { -#if defined(USE_SCALAR_INV_BUILTIN) - secp256k1_scalar_inverse(r, x); -#elif defined(USE_SCALAR_INV_NUM) - unsigned char b[32]; - secp256k1_num n, m; - secp256k1_scalar t = *x; - secp256k1_scalar_get_b32(b, &t); - secp256k1_num_set_bin(&n, b, 32); - secp256k1_scalar_order_get_num(&m); - secp256k1_num_mod_inverse(&n, &n, &m); - secp256k1_num_get_bin(b, 32, &n); - secp256k1_scalar_set_b32(r, b, NULL); - /* Verify that the inverse was computed correctly, without GMP code. */ - secp256k1_scalar_mul(&t, &t, r); - CHECK(secp256k1_scalar_is_one(&t)); -#else -#error "Please select scalar inverse implementation" -#endif -} - /* These parameters are generated using sage/gen_exhaustive_groups.sage. */ #if defined(EXHAUSTIVE_TEST_ORDER) # if EXHAUSTIVE_TEST_ORDER == 13 # define EXHAUSTIVE_TEST_LAMBDA 9 # elif EXHAUSTIVE_TEST_ORDER == 199 # define EXHAUSTIVE_TEST_LAMBDA 92 # else # error No known lambda for the specified exhaustive test group order. # endif /** * Find r1 and r2 given k, such that r1 + r2 * lambda == k mod n; unlike in the * full case we don't bother making r1 and r2 be small, we just want them to be * nontrivial to get full test coverage for the exhaustive tests. We therefore * (arbitrarily) set r2 = k + 5 (mod n) and r1 = k - r2 * lambda (mod n). */ static void secp256k1_scalar_split_lambda(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) { *r2 = (*k + 5) % EXHAUSTIVE_TEST_ORDER; *r1 = (*k + (EXHAUSTIVE_TEST_ORDER - *r2) * EXHAUSTIVE_TEST_LAMBDA) % EXHAUSTIVE_TEST_ORDER; } #else /** * The Secp256k1 curve has an endomorphism, where lambda * (x, y) = (beta * x, y), where * lambda is: */ static const secp256k1_scalar secp256k1_const_lambda = SECP256K1_SCALAR_CONST( 0x5363AD4CUL, 0xC05C30E0UL, 0xA5261C02UL, 0x8812645AUL, 0x122E22EAUL, 0x20816678UL, 0xDF02967CUL, 0x1B23BD72UL ); #ifdef VERIFY static void secp256k1_scalar_split_lambda_verify(const secp256k1_scalar *r1, const secp256k1_scalar *r2, const secp256k1_scalar *k); #endif /* * Both lambda and beta are primitive cube roots of unity. That is, lambda^3 == 1 mod n and * beta^3 == 1 mod p, where n is the curve order and p is the field order. * * Furthermore, because (X^3 - 1) = (X - 1)(X^2 + X + 1), the primitive cube roots of unity are * roots of X^2 + X + 1. Therefore lambda^2 + lambda == -1 mod n and beta^2 + beta == -1 mod p. * (The other primitive cube roots of unity are lambda^2 and beta^2 respectively.) * * Let l = -1/2 + i*sqrt(3)/2, the complex root of X^2 + X + 1. We can define a ring * homomorphism phi : Z[l] -> Z_n where phi(a + b*l) == a + b*lambda mod n. The kernel of phi * is a lattice over Z[l] (considering Z[l] as a Z-module). 
This lattice is generated by a * reduced basis {a1 + b1*l, a2 + b2*l} where * * - a1 = {0x30,0x86,0xd2,0x21,0xa7,0xd4,0x6b,0xcd,0xe8,0x6c,0x90,0xe4,0x92,0x84,0xeb,0x15} * - b1 = -{0xe4,0x43,0x7e,0xd6,0x01,0x0e,0x88,0x28,0x6f,0x54,0x7f,0xa9,0x0a,0xbf,0xe4,0xc3} * - a2 = {0x01,0x14,0xca,0x50,0xf7,0xa8,0xe2,0xf3,0xf6,0x57,0xc1,0x10,0x8d,0x9d,0x44,0xcf,0xd8} * - b2 = {0x30,0x86,0xd2,0x21,0xa7,0xd4,0x6b,0xcd,0xe8,0x6c,0x90,0xe4,0x92,0x84,0xeb,0x15} * * "Guide to Elliptic Curve Cryptography" (Hankerson, Menezes, Vanstone) gives an algorithm * (algorithm 3.74) to find k1 and k2 given k, such that k1 + k2 * lambda == k mod n, and k1 * and k2 are small in absolute value. * * The algorithm computes c1 = round(b2 * k / n) and c2 = round((-b1) * k / n), and gives * k1 = k - (c1*a1 + c2*a2) and k2 = -(c1*b1 + c2*b2). Instead, we use modular arithmetic, and * compute r2 = k2 mod n, and r1 = k1 mod n = (k - r2 * lambda) mod n, avoiding the need for * the constants a1 and a2. * * g1, g2 are precomputed constants used to replace division with a rounded multiplication * when decomposing the scalar for an endomorphism-based point multiplication. * * The possibility of using precomputed estimates is mentioned in "Guide to Elliptic Curve * Cryptography" (Hankerson, Menezes, Vanstone) in section 3.5. * * The derivation is described in the paper "Efficient Software Implementation of Public-Key * Cryptography on Sensor Networks Using the MSP430X Microcontroller" (Gouvea, Oliveira, Lopez), * Section 4.3 (here we use a somewhat higher-precision estimate): * d = a1*b2 - b1*a2 * g1 = round(2^384 * b2/d) * g2 = round(2^384 * (-b1)/d) * * (Note that d is also equal to the curve order, n, here because [a1,b1] and [a2,b2] * can be found as outputs of the Extended Euclidean Algorithm on inputs n and lambda). * * The function below splits k into r1 and r2, such that * - r1 + lambda * r2 == k (mod n) * - either r1 < 2^128 or -r1 mod n < 2^128 * - either r2 < 2^128 or -r2 mod n < 2^128 * * See proof below. */ static void secp256k1_scalar_split_lambda(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *k) { secp256k1_scalar c1, c2; static const secp256k1_scalar minus_b1 = SECP256K1_SCALAR_CONST( 0x00000000UL, 0x00000000UL, 0x00000000UL, 0x00000000UL, 0xE4437ED6UL, 0x010E8828UL, 0x6F547FA9UL, 0x0ABFE4C3UL ); static const secp256k1_scalar minus_b2 = SECP256K1_SCALAR_CONST( 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFFUL, 0xFFFFFFFEUL, 0x8A280AC5UL, 0x0774346DUL, 0xD765CDA8UL, 0x3DB1562CUL ); static const secp256k1_scalar g1 = SECP256K1_SCALAR_CONST( 0x3086D221UL, 0xA7D46BCDUL, 0xE86C90E4UL, 0x9284EB15UL, 0x3DAA8A14UL, 0x71E8CA7FUL, 0xE893209AUL, 0x45DBB031UL ); static const secp256k1_scalar g2 = SECP256K1_SCALAR_CONST( 0xE4437ED6UL, 0x010E8828UL, 0x6F547FA9UL, 0x0ABFE4C4UL, 0x221208ACUL, 0x9DF506C6UL, 0x1571B4AEUL, 0x8AC47F71UL ); VERIFY_CHECK(r1 != k); VERIFY_CHECK(r2 != k); /* these _var calls are constant time since the shift amount is constant */ secp256k1_scalar_mul_shift_var(&c1, k, &g1, 384); secp256k1_scalar_mul_shift_var(&c2, k, &g2, 384); secp256k1_scalar_mul(&c1, &c1, &minus_b1); secp256k1_scalar_mul(&c2, &c2, &minus_b2); secp256k1_scalar_add(r2, &c1, &c2); secp256k1_scalar_mul(r1, r2, &secp256k1_const_lambda); secp256k1_scalar_negate(r1, r1); secp256k1_scalar_add(r1, r1, k); #ifdef VERIFY secp256k1_scalar_split_lambda_verify(r1, r2, k); #endif } #ifdef VERIFY /* * Proof for secp256k1_scalar_split_lambda's bounds. 
* * Let * - epsilon1 = 2^256 * |g1/2^384 - b2/d| * - epsilon2 = 2^256 * |g2/2^384 - (-b1)/d| * - c1 = round(k*g1/2^384) * - c2 = round(k*g2/2^384) * * Lemma 1: |c1 - k*b2/d| < 2^-1 + epsilon1 * * |c1 - k*b2/d| * = * |c1 - k*g1/2^384 + k*g1/2^384 - k*b2/d| * <= {triangle inequality} * |c1 - k*g1/2^384| + |k*g1/2^384 - k*b2/d| * = * |c1 - k*g1/2^384| + k*|g1/2^384 - b2/d| * < {rounding in c1 and 0 <= k < 2^256} * 2^-1 + 2^256 * |g1/2^384 - b2/d| * = {definition of epsilon1} * 2^-1 + epsilon1 * * Lemma 2: |c2 - k*(-b1)/d| < 2^-1 + epsilon2 * * |c2 - k*(-b1)/d| * = * |c2 - k*g2/2^384 + k*g2/2^384 - k*(-b1)/d| * <= {triangle inequality} * |c2 - k*g2/2^384| + |k*g2/2^384 - k*(-b1)/d| * = * |c2 - k*g2/2^384| + k*|g2/2^384 - (-b1)/d| * < {rounding in c2 and 0 <= k < 2^256} * 2^-1 + 2^256 * |g2/2^384 - (-b1)/d| * = {definition of epsilon2} * 2^-1 + epsilon2 * * Let * - k1 = k - c1*a1 - c2*a2 * - k2 = - c1*b1 - c2*b2 * * Lemma 3: |k1| < (a1 + a2 + 1)/2 < 2^128 * * |k1| * = {definition of k1} * |k - c1*a1 - c2*a2| * = {(a1*b2 - b1*a2)/n = 1} * |k*(a1*b2 - b1*a2)/n - c1*a1 - c2*a2| * = * |a1*(k*b2/n - c1) + a2*(k*(-b1)/n - c2)| * <= {triangle inequality} * a1*|k*b2/n - c1| + a2*|k*(-b1)/n - c2| * < {Lemma 1 and Lemma 2} * a1*(2^-1 + epsilon1) + a2*(2^-1 + epsilon2) * < {rounding up to an integer} * (a1 + a2 + 1)/2 * < {rounding up to a power of 2} * 2^128 * * Lemma 4: |k2| < (-b1 + b2)/2 + 1 < 2^128 * * |k2| * = {definition of k2} * |- c1*b1 - c2*b2| * = {(b1*b2 - b1*b2)/n = 0} * |k*(b1*b2 - b1*b2)/n - c1*b1 - c2*b2| * = * |b1*(k*b2/n - c1) + b2*(k*(-b1)/n - c2)| * <= {triangle inequality} * (-b1)*|k*b2/n - c1| + b2*|k*(-b1)/n - c2| * < {Lemma 1 and Lemma 2} * (-b1)*(2^-1 + epsilon1) + b2*(2^-1 + epsilon2) * < {rounding up to an integer} * (-b1 + b2)/2 + 1 * < {rounding up to a power of 2} * 2^128 * * Let * - r2 = k2 mod n * - r1 = k - r2*lambda mod n. * * Notice that r1 is defined such that r1 + r2 * lambda == k (mod n). * * Lemma 5: r1 == k1 mod n. * * r1 * == {definition of r1 and r2} * k - k2*lambda * == {definition of k2} * k - (- c1*b1 - c2*b2)*lambda * == * k + c1*b1*lambda + c2*b2*lambda * == {a1 + b1*lambda == 0 mod n and a2 + b2*lambda == 0 mod n} * k - c1*a1 - c2*a2 * == {definition of k1} * k1 * * From Lemma 3, Lemma 4, Lemma 5 and the definition of r2, we can conclude that * * - either r1 < 2^128 or -r1 mod n < 2^128 * - either r2 < 2^128 or -r2 mod n < 2^128. * * Q.E.D. 
*/ static void secp256k1_scalar_split_lambda_verify(const secp256k1_scalar *r1, const secp256k1_scalar *r2, const secp256k1_scalar *k) { secp256k1_scalar s; unsigned char buf1[32]; unsigned char buf2[32]; /* (a1 + a2 + 1)/2 is 0xa2a8918ca85bafe22016d0b917e4dd77 */ static const unsigned char k1_bound[32] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xa2, 0xa8, 0x91, 0x8c, 0xa8, 0x5b, 0xaf, 0xe2, 0x20, 0x16, 0xd0, 0xb9, 0x17, 0xe4, 0xdd, 0x77 }; /* (-b1 + b2)/2 + 1 is 0x8a65287bd47179fb2be08846cea267ed */ static const unsigned char k2_bound[32] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x8a, 0x65, 0x28, 0x7b, 0xd4, 0x71, 0x79, 0xfb, 0x2b, 0xe0, 0x88, 0x46, 0xce, 0xa2, 0x67, 0xed }; secp256k1_scalar_mul(&s, &secp256k1_const_lambda, r2); secp256k1_scalar_add(&s, &s, r1); VERIFY_CHECK(secp256k1_scalar_eq(&s, k)); secp256k1_scalar_negate(&s, r1); secp256k1_scalar_get_b32(buf1, r1); secp256k1_scalar_get_b32(buf2, &s); VERIFY_CHECK(secp256k1_memcmp_var(buf1, k1_bound, 32) < 0 || secp256k1_memcmp_var(buf2, k1_bound, 32) < 0); secp256k1_scalar_negate(&s, r2); secp256k1_scalar_get_b32(buf1, r2); secp256k1_scalar_get_b32(buf2, &s); VERIFY_CHECK(secp256k1_memcmp_var(buf1, k2_bound, 32) < 0 || secp256k1_memcmp_var(buf2, k2_bound, 32) < 0); } #endif /* VERIFY */ #endif /* !defined(EXHAUSTIVE_TEST_ORDER) */ #endif /* SECP256K1_SCALAR_IMPL_H */ diff --git a/src/secp256k1/src/scalar_low_impl.h b/src/secp256k1/src/scalar_low_impl.h index 98ffd1536..eff270720 100644 --- a/src/secp256k1/src/scalar_low_impl.h +++ b/src/secp256k1/src/scalar_low_impl.h @@ -1,128 +1,143 @@ /*********************************************************************** * Copyright (c) 2015 Andrew Poelstra * * Distributed under the MIT software license, see the accompanying * * file COPYING or https://www.opensource.org/licenses/mit-license.php.* ***********************************************************************/ #ifndef SECP256K1_SCALAR_REPR_IMPL_H #define SECP256K1_SCALAR_REPR_IMPL_H #include "scalar.h" #include <string.h> SECP256K1_INLINE static int secp256k1_scalar_is_even(const secp256k1_scalar *a) { return !(*a & 1); } SECP256K1_INLINE static void secp256k1_scalar_clear(secp256k1_scalar *r) { *r = 0; } SECP256K1_INLINE static void secp256k1_scalar_set_int(secp256k1_scalar *r, unsigned int v) { *r = v; } SECP256K1_INLINE static unsigned int secp256k1_scalar_get_bits(const secp256k1_scalar *a, unsigned int offset, unsigned int count) { if (offset < 32) return ((*a >> offset) & ((((uint32_t)1) << count) - 1)); else return 0; } SECP256K1_INLINE static unsigned int secp256k1_scalar_get_bits_var(const secp256k1_scalar *a, unsigned int offset, unsigned int count) { return secp256k1_scalar_get_bits(a, offset, count); } SECP256K1_INLINE static int secp256k1_scalar_check_overflow(const secp256k1_scalar *a) { return *a >= EXHAUSTIVE_TEST_ORDER; } static int secp256k1_scalar_add(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b) { *r = (*a + *b) % EXHAUSTIVE_TEST_ORDER; return *r < *b; } static void secp256k1_scalar_cadd_bit(secp256k1_scalar *r, unsigned int bit, int flag) { if (flag && bit < 32) *r += ((uint32_t)1 << bit); #ifdef VERIFY VERIFY_CHECK(bit < 32); /* Verify that adding (1 << bit) will not overflow any in-range scalar *r by overflowing the underlying uint32_t. 
*/ VERIFY_CHECK(((uint32_t)1 << bit) - 1 <= UINT32_MAX - EXHAUSTIVE_TEST_ORDER); VERIFY_CHECK(secp256k1_scalar_check_overflow(r) == 0); #endif } static void secp256k1_scalar_set_b32(secp256k1_scalar *r, const unsigned char *b32, int *overflow) { int i; int over = 0; *r = 0; for (i = 0; i < 32; i++) { *r = (*r * 0x100) + b32[i]; if (*r >= EXHAUSTIVE_TEST_ORDER) { over = 1; *r %= EXHAUSTIVE_TEST_ORDER; } } if (overflow) *overflow = over; } static void secp256k1_scalar_get_b32(unsigned char *bin, const secp256k1_scalar* a) { memset(bin, 0, 32); bin[28] = *a >> 24; bin[29] = *a >> 16; bin[30] = *a >> 8; bin[31] = *a; } SECP256K1_INLINE static int secp256k1_scalar_is_zero(const secp256k1_scalar *a) { return *a == 0; } static void secp256k1_scalar_negate(secp256k1_scalar *r, const secp256k1_scalar *a) { if (*a == 0) { *r = 0; } else { *r = EXHAUSTIVE_TEST_ORDER - *a; } } SECP256K1_INLINE static int secp256k1_scalar_is_one(const secp256k1_scalar *a) { return *a == 1; } static int secp256k1_scalar_is_high(const secp256k1_scalar *a) { return *a > EXHAUSTIVE_TEST_ORDER / 2; } static int secp256k1_scalar_cond_negate(secp256k1_scalar *r, int flag) { if (flag) secp256k1_scalar_negate(r, r); return flag ? -1 : 1; } static void secp256k1_scalar_mul(secp256k1_scalar *r, const secp256k1_scalar *a, const secp256k1_scalar *b) { *r = (*a * *b) % EXHAUSTIVE_TEST_ORDER; } static int secp256k1_scalar_shr_int(secp256k1_scalar *r, int n) { int ret; VERIFY_CHECK(n > 0); VERIFY_CHECK(n < 16); ret = *r & ((1 << n) - 1); *r >>= n; return ret; } static void secp256k1_scalar_sqr(secp256k1_scalar *r, const secp256k1_scalar *a) { *r = (*a * *a) % EXHAUSTIVE_TEST_ORDER; } static void secp256k1_scalar_split_128(secp256k1_scalar *r1, secp256k1_scalar *r2, const secp256k1_scalar *a) { *r1 = *a; *r2 = 0; } SECP256K1_INLINE static int secp256k1_scalar_eq(const secp256k1_scalar *a, const secp256k1_scalar *b) { return *a == *b; } static SECP256K1_INLINE void secp256k1_scalar_cmov(secp256k1_scalar *r, const secp256k1_scalar *a, int flag) { uint32_t mask0, mask1; VG_CHECK_VERIFY(r, sizeof(*r)); mask0 = flag + ~((uint32_t)0); mask1 = ~mask0; *r = (*r & mask0) | (*a & mask1); } +static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) { + int i; + *r = 0; + for (i = 0; i < EXHAUSTIVE_TEST_ORDER; i++) + if ((i * *x) % EXHAUSTIVE_TEST_ORDER == 1) + *r = i; + /* If this VERIFY_CHECK triggers we were given a noninvertible scalar (and thus + * have a composite group order; fix it in exhaustive_tests.c). */ + VERIFY_CHECK(*r != 0); +} + +static void secp256k1_scalar_inverse_var(secp256k1_scalar *r, const secp256k1_scalar *x) { + secp256k1_scalar_inverse(r, x); +} + #endif /* SECP256K1_SCALAR_REPR_IMPL_H */
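
Editor's note: the muladd/muladd_fast/extract macros in scalar_8x32_impl.h maintain a 96-bit accumulator in three uint32_t limbs (c0,c1,c2): each 32x32->64-bit product is split into a high and a low half, the low half is added into c0, and each carry is detected with the unsigned-compare idiom (c0 < tl) and rippled into the next limb. The following standalone sketch (illustrative only, not part of the patch) isolates that idiom and checks it on a sum of products that would overflow a plain 64-bit accumulator:

/* Standalone sketch of the (c0,c1,c2) 96-bit accumulator idiom. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint32_t c0 = 0, c1 = 0, c2 = 0;
    uint32_t a = 0xFFFFFFFFu, b = 0xFFFFFFFFu;
    int i;
    /* Accumulate a*b eight times; the total (~2^67) exceeds 64 bits. */
    for (i = 0; i < 8; i++) {
        uint64_t t = (uint64_t)a * b;
        uint32_t th = (uint32_t)(t >> 32), tl = (uint32_t)t;
        c0 += tl; th += (c0 < tl);   /* carry out of the low limb */
        c1 += th; c2 += (c1 < th);   /* carry out of the middle limb */
    }
    /* Expected: 8 * 0xFFFFFFFE00000001 = c2:c1:c0 = 7:0xFFFFFFF0:8. */
    printf("limb2=%08x limb1=%08x limb0=%08x\n",
           (unsigned)c2, (unsigned)c1, (unsigned)c0);
    return 0;
}

The extract(n) macro then just pulls c0 out as the next output limb and shifts the accumulator down one limb, which is why the c2-never-overflows contract can be checked locally at each step.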
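
The per-representation secp256k1_scalar_inverse moved into scalar_8x32_impl.h computes x^(n-2) mod n (Fermat's little theorem) with a fixed addition chain, so the sequence of squarings and multiplications is independent of the input and runs in constant time. A toy sketch of the same principle, using plain square-and-multiply and the small prime 13 that also appears above as an EXHAUSTIVE_TEST_ORDER value (illustrative only, not library code):

/* Toy Fermat inversion: x^(p-2) == x^-1 (mod p) for prime p. */
#include <stdint.h>
#include <stdio.h>

static uint32_t mulmod(uint32_t a, uint32_t b, uint32_t p) {
    return (uint32_t)(((uint64_t)a * b) % p);
}

static uint32_t powmod(uint32_t x, uint32_t e, uint32_t p) {
    uint32_t r = 1;
    while (e) {                      /* square-and-multiply */
        if (e & 1) r = mulmod(r, x, p);
        x = mulmod(x, x, p);
        e >>= 1;
    }
    return r;
}

int main(void) {
    const uint32_t p = 13;           /* cf. EXHAUSTIVE_TEST_ORDER == 13 */
    uint32_t x;
    for (x = 1; x < p; x++) {
        uint32_t inv = powmod(x, p - 2, p);
        printf("%u * %u mod %u = %u\n", (unsigned)x, (unsigned)inv,
               (unsigned)p, (unsigned)mulmod(x, inv, p)); /* always 1 */
    }
    return 0;
}

The library's chain differs from naive square-and-multiply only in that the multiplier sequence (x2, x3, u5, u9, u11, u13, ...) is precomputed and hard-coded for the specific 256-bit exponent n-2.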
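
secp256k1_scalar_mul_shift_var returns round((a*b) / 2^shift): it truncates the 512-bit product by shift bits and then, via the final secp256k1_scalar_cadd_bit call, adds bit (shift-1) of the product back in, which is exactly round-half-up. A minimal sketch of that rounding trick on a plain 64-bit value (illustrative only):

/* Rounding right-shift without risking overflow from adding 2^(s-1):
 * round(v / 2^s) = (v >> s) + bit (s-1) of v. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
    uint64_t v = 0xDEADBEEFCAFEull;
    unsigned s = 17;
    uint64_t rounded = (v >> s) + ((v >> (s - 1)) & 1);
    printf("trunc=%llu rounded=%llu\n",
           (unsigned long long)(v >> s), (unsigned long long)rounded);
    return 0;
}

Computing floor((v + 2^(s-1)) / 2^s) directly would need an extra carry limb for values near the top of the range; extracting the rounding bit sidesteps that, which matters for the 512-bit product l[0..15].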
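
For the exhaustive-test secp256k1_scalar_split_lambda, r2 = k + 5 (mod n) and r1 = k - r2*lambda (mod n), so r1 + r2*lambda == k (mod n) holds by construction for every k. The sketch below checks that identity exhaustively using the documented test parameters n = 199, lambda = 92 (a standalone check, not library code):

/* Exhaustive check of the split identity r1 + r2*lambda == k (mod n). */
#include <stdio.h>

int main(void) {
    const unsigned n = 199, lambda = 92;  /* EXHAUSTIVE_TEST_ORDER / _LAMBDA */
    unsigned k;
    for (k = 0; k < n; k++) {
        unsigned r2 = (k + 5) % n;
        unsigned r1 = (k + (n - r2) * lambda) % n; /* k - r2*lambda mod n */
        if ((r1 + r2 * lambda) % n != k) {
            printf("mismatch at k=%u\n", k);
            return 1;
        }
    }
    printf("split identity holds for all k mod %u\n", n);
    return 0;
}

The real split in the non-exhaustive branch satisfies the same congruence, but additionally guarantees (per the proof above) that r1 and r2, or their negations mod n, fit in 128 bits.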
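
Both secp256k1_scalar_cmov implementations rely on the mask construction mask0 = flag + ~0: adding flag (which must be 0 or 1) to the all-ones word yields all-ones when flag == 0 and zero when flag == 1, giving a branch-free select between the two operands. Isolated as a sketch (illustrative only):

/* Branch-free conditional move on one 32-bit word. */
#include <stdint.h>
#include <stdio.h>

static uint32_t cmov32(uint32_t r, uint32_t a, int flag) {
    uint32_t mask0 = (uint32_t)flag + ~((uint32_t)0); /* all-ones iff flag == 0 */
    uint32_t mask1 = ~mask0;                          /* all-ones iff flag == 1 */
    return (r & mask0) | (a & mask1);
}

int main(void) {
    printf("%u %u\n", (unsigned)cmov32(7, 9, 0),
                      (unsigned)cmov32(7, 9, 1)); /* prints "7 9" */
    return 0;
}

Selecting with masks instead of a branch keeps the memory-access and instruction pattern independent of the secret flag, which is the same rationale as the nonzero mask in secp256k1_scalar_cond_negate.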