From caf32fd41f30b9f4eeffa37529a2aafe2259cd52 Mon Sep 17 00:00:00 2001
From: djm <djm@openbsd.org>
Date: Sun, 15 Sep 2024 02:20:51 +0000
Subject: [PATCH] update the Streamlined NTRU Prime code from the "ref"
 implementation in SUPERCOP 20201130 to the "compact" implementation in
 SUPERCOP 20240808. The new version is substantially faster. Thanks to Daniel
 J Bernstein for pointing out the new implementation (and of course for
 writing it).

tested in snaps/ok deraadt@
---
 usr.bin/ssh/kexsntrup761x25519.c |    6 +-
 usr.bin/ssh/sntrup761.c          | 2886 +++++++++++++++++++-----------
 usr.bin/ssh/sntrup761.sh         |   57 +-
 3 files changed, 1925 insertions(+), 1024 deletions(-)

diff --git a/usr.bin/ssh/kexsntrup761x25519.c b/usr.bin/ssh/kexsntrup761x25519.c
index e3f7831d39f..c2055d76c39 100644
--- a/usr.bin/ssh/kexsntrup761x25519.c
+++ b/usr.bin/ssh/kexsntrup761x25519.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: kexsntrup761x25519.c,v 1.2 2021/12/05 12:28:27 jsg Exp $ */
+/* $OpenBSD: kexsntrup761x25519.c,v 1.3 2024/09/15 02:20:51 djm Exp $ */
 /*
  * Copyright (c) 2019 Markus Friedl.  All rights reserved.
  *
@@ -35,6 +35,10 @@
 #include "digest.h"
 #include "ssherr.h"
 
+volatile crypto_int16 crypto_int16_optblocker = 0; /* zero-initialised volatile that the crypto_int16_* fallback paths in sntrup761.c XOR into their intermediates */
+volatile crypto_int32 crypto_int32_optblocker = 0; /* 32-bit counterpart; declared extern in sntrup761.c */
+volatile crypto_int64 crypto_int64_optblocker = 0; /* 64-bit counterpart; declared extern in sntrup761.c */
+
 int
 kex_kem_sntrup761x25519_keypair(struct kex *kex)
 {
diff --git a/usr.bin/ssh/sntrup761.c b/usr.bin/ssh/sntrup761.c
index 3ec225a0af4..81ae5dc86d7 100644
--- a/usr.bin/ssh/sntrup761.c
+++ b/usr.bin/ssh/sntrup761.c
@@ -1,4 +1,4 @@
-/*  $OpenBSD: sntrup761.c,v 1.6 2023/01/11 02:13:52 djm Exp $ */
+/*  $OpenBSD: sntrup761.c,v 1.7 2024/09/15 02:20:51 djm Exp $ */
 
 /*
  * Public Domain, Authors:
@@ -11,6 +11,8 @@
 #include <string.h>
 #include "crypto_api.h"
 
+#define crypto_declassify(x, y) do {} while (0) /* no-op stub: expands to an empty statement and ignores both arguments */
+
 #define int8 crypto_int8
 #define uint8 crypto_uint8
 #define int16 crypto_int16
@@ -19,1251 +21,2133 @@
 #define uint32 crypto_uint32
 #define int64 crypto_int64
 #define uint64 crypto_uint64
+extern volatile crypto_int16 crypto_int16_optblocker; /* defined with value 0 in kexsntrup761x25519.c */
+extern volatile crypto_int32 crypto_int32_optblocker; /* defined with value 0 in kexsntrup761x25519.c */
+extern volatile crypto_int64 crypto_int64_optblocker; /* defined with value 0 in kexsntrup761x25519.c */
 
-/* from supercop-20201130/crypto_sort/int32/portable4/int32_minmax.inc */
-#define int32_MINMAX(a,b) \
-do { \
-  int64_t ab = (int64_t)b ^ (int64_t)a; \
-  int64_t c = (int64_t)b - (int64_t)a; \
-  c ^= ab & (c ^ b); \
-  c >>= 31; \
-  c &= ab; \
-  a ^= c; \
-  b ^= c; \
-} while(0)
-
-/* from supercop-20201130/crypto_sort/int32/portable4/sort.c */
+/* from supercop-20240808/cryptoint/crypto_int16.h */
+/* auto-generated: cd cryptoint; ./autogen */
+/* cryptoint 20240806 */
 
+#ifndef crypto_int16_h
+#define crypto_int16_h
 
-static void crypto_sort_int32(void *array,long long n)
-{
-  long long top,p,q,r,i,j;
-  int32 *x = array;
-
-  if (n < 2) return;
-  top = 1;
-  while (top < n - top) top += top;
+#define crypto_int16 int16_t
+#define crypto_int16_unsigned uint16_t
 
-  for (p = top;p >= 1;p >>= 1) {
-    i = 0;
-    while (i + 2 * p <= n) {
-      for (j = i;j < i + p;++j)
-        int32_MINMAX(x[j],x[j+p]);
-      i += 2 * p;
-    }
-    for (j = i;j < n - p;++j)
-      int32_MINMAX(x[j],x[j+p]);
 
-    i = 0;
-    j = 0;
-    for (q = top;q > p;q >>= 1) {
-      if (j != i) for (;;) {
-        if (j == n - q) goto done;
-        int32 a = x[j + p];
-        for (r = q;r > p;r >>= 1)
-          int32_MINMAX(a,x[j + r]);
-        x[j + p] = a;
-        ++j;
-        if (j == i + p) {
-          i += 2 * p;
-          break;
-        }
-      }
-      while (i + p <= n - q) {
-        for (j = i;j < i + p;++j) {
-          int32 a = x[j + p];
-          for (r = q;r > p;r >>= 1)
-            int32_MINMAX(a,x[j+r]);
-          x[j + p] = a;
-        }
-        i += 2 * p;
-      }
-      /* now i + p > n - q */
-      j = i;
-      while (j < n - q) {
-        int32 a = x[j + p];
-        for (r = q;r > p;r >>= 1)
-          int32_MINMAX(a,x[j+r]);
-        x[j + p] = a;
-        ++j;
-      }
 
-      done: ;
-    }
-  }
+__attribute__((unused)) /* Reads two bytes at crypto_int16_s and returns them as one little-endian 16-bit value. */
+static inline
+crypto_int16 crypto_int16_load(const unsigned char *crypto_int16_s) {
+  crypto_int16 crypto_int16_z = 0;
+  crypto_int16_z |= ((crypto_int16) (*crypto_int16_s++)) << 0; /* first byte supplies bits 0..7 */
+  crypto_int16_z |= ((crypto_int16) (*crypto_int16_s++)) << 8; /* second byte supplies bits 8..15 */
+  return crypto_int16_z;
 }
 
-/* from supercop-20201130/crypto_sort/uint32/useint32/sort.c */
-
-/* can save time by vectorizing xor loops */
-/* can save time by integrating xor loops with int32_sort */
-
-static void crypto_sort_uint32(void *array,long long n)
-{
-  crypto_uint32 *x = array;
-  long long j;
-  for (j = 0;j < n;++j) x[j] ^= 0x80000000;
-  crypto_sort_int32(array,n);
-  for (j = 0;j < n;++j) x[j] ^= 0x80000000;
+__attribute__((unused)) /* Writes crypto_int16_x to the two bytes at crypto_int16_s, low byte first. */
+static inline
+void crypto_int16_store(unsigned char *crypto_int16_s,crypto_int16 crypto_int16_x) {
+  *crypto_int16_s++ = crypto_int16_x >> 0; /* assignment to unsigned char keeps the low 8 bits */
+  *crypto_int16_s++ = crypto_int16_x >> 8; /* bits 8..15 */
 }
 
-/* from supercop-20201130/crypto_kem/sntrup761/ref/uint32.c */
-
-/*
-CPU division instruction typically takes time depending on x.
-This software is designed to take time independent of x.
-Time still varies depending on m; user must ensure that m is constant.
-Time also varies on CPUs where multiplication is variable-time.
-There could be more CPU issues.
-There could also be compiler issues.
-*/
-
-static void uint32_divmod_uint14(uint32 *q,uint16 *r,uint32 x,uint16 m)
-{
-  uint32 v = 0x80000000;
-  uint32 qpart;
-  uint32 mask;
+__attribute__((unused)) /* Returns -1 (all bits set) when x is negative and 0 otherwise, using shifts only. */
+static inline
+crypto_int16 crypto_int16_negative_mask(crypto_int16 crypto_int16_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("sarw $15,%0" : "+r"(crypto_int16_x) : : "cc"); /* arithmetic right shift by 15 copies the sign bit into every bit */
+  return crypto_int16_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_y;
+  __asm__ ("sbfx %w0,%w1,15,1" : "=r"(crypto_int16_y) : "r"(crypto_int16_x) : ); /* signed extract of the single bit 15 */
+  return crypto_int16_y;
+#else
+  crypto_int16_x >>= 16-6; /* portable path: shift by 10 here and by 5 below, 15 in total; relies on >> of a negative value being arithmetic */
+  crypto_int16_x ^= crypto_int16_optblocker; /* volatile read between the two shifts; NOTE(review): presumably keeps the compiler from rewriting the idiom as a branch, and assumes the variable is still 0 - confirm */
+  crypto_int16_x >>= 5;
+  return crypto_int16_x;
+#endif
+}
 
-  v /= m;
+__attribute__((unused)) /* Returns bit 15 of the unsigned argument as 0 or 1. */
+static inline
+crypto_int16_unsigned crypto_int16_unsigned_topbit_01(crypto_int16_unsigned crypto_int16_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("shrw $15,%0" : "+r"(crypto_int16_x) : : "cc"); /* logical right shift leaves only the former top bit */
+  return crypto_int16_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_y;
+  __asm__ ("ubfx %w0,%w1,15,1" : "=r"(crypto_int16_y) : "r"(crypto_int16_x) : ); /* unsigned extract of the single bit 15 */
+  return crypto_int16_y;
+#else
+  crypto_int16_x >>= 16-6; /* portable path: shift by 10 here and by 5 below, 15 in total */
+  crypto_int16_x ^= crypto_int16_optblocker; /* volatile read between the two shifts; NOTE(review): presumably an optimisation barrier, assumes the variable is still 0 - confirm */
+  crypto_int16_x >>= 5;
+  return crypto_int16_x;
+#endif
+}
 
-  /* caller guarantees m > 0 */
-  /* caller guarantees m < 16384 */
-  /* vm <= 2^31 <= vm+m-1 */
-  /* xvm <= 2^31 x <= xvm+x(m-1) */
+__attribute__((unused)) /* Returns 1 when x is negative and 0 otherwise. */
+static inline
+crypto_int16 crypto_int16_negative_01(crypto_int16 crypto_int16_x) {
+  return crypto_int16_unsigned_topbit_01(crypto_int16_x); /* x converts to the unsigned parameter type; its bit 15 is the sign */
+}
 
-  *q = 0;
+__attribute__((unused)) /* Returns -1 when bit 15 of x is set and 0 otherwise. */
+static inline
+crypto_int16 crypto_int16_topbit_mask(crypto_int16 crypto_int16_x) {
+  return crypto_int16_negative_mask(crypto_int16_x); /* plain forwarder: same result as the negative mask */
+}
 
-  qpart = (x*(uint64)v)>>31;
-  /* 2^31 qpart <= xv <= 2^31 qpart + 2^31-1 */
-  /* 2^31 qpart m <= xvm <= 2^31 qpart m + (2^31-1)m */
-  /* 2^31 qpart m <= 2^31 x <= 2^31 qpart m + (2^31-1)m + x(m-1) */
-  /* 0 <= 2^31 newx <= (2^31-1)m + x(m-1) */
-  /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */
-  /* 0 <= newx <= (1-1/2^31)(2^14-1) + (2^32-1)((2^14-1)-1)/2^31 */
+__attribute__((unused)) /* Returns bit 15 of x as 0 or 1. */
+static inline
+crypto_int16 crypto_int16_topbit_01(crypto_int16 crypto_int16_x) {
+  return crypto_int16_unsigned_topbit_01(crypto_int16_x); /* plain forwarder through the unsigned helper */
+}
 
-  x -= qpart*m; *q += qpart;
-  /* x <= 49146 */
+__attribute__((unused)) /* Returns -1 when bit 0 of x is set and 0 otherwise. */
+static inline
+crypto_int16 crypto_int16_bottombit_mask(crypto_int16 crypto_int16_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("andw $1,%0" : "+r"(crypto_int16_x) : : "cc"); /* keep only bit 0 */
+  return -crypto_int16_x; /* 1 becomes -1, 0 stays 0 */
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_y;
+  __asm__ ("sbfx %w0,%w1,0,1" : "=r"(crypto_int16_y) : "r"(crypto_int16_x) : ); /* signed extract of the single bit 0 */
+  return crypto_int16_y;
+#else
+  crypto_int16_x &= 1 ^ crypto_int16_optblocker; /* mask is 1 as long as the volatile optblocker reads as 0 */
+  return -crypto_int16_x;
+#endif
+}
 
-  qpart = (x*(uint64)v)>>31;
-  /* 0 <= newx <= (1-1/2^31)m + x(m-1)/2^31 */
-  /* 0 <= newx <= m + 49146(2^14-1)/2^31 */
-  /* 0 <= newx <= m + 0.4 */
-  /* 0 <= newx <= m */
+__attribute__((unused)) /* Returns bit 0 of x as 0 or 1. */
+static inline
+crypto_int16 crypto_int16_bottombit_01(crypto_int16 crypto_int16_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("andw $1,%0" : "+r"(crypto_int16_x) : : "cc"); /* keep only bit 0 */
+  return crypto_int16_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_y;
+  __asm__ ("ubfx %w0,%w1,0,1" : "=r"(crypto_int16_y) : "r"(crypto_int16_x) : ); /* unsigned extract of the single bit 0 */
+  return crypto_int16_y;
+#else
+  crypto_int16_x &= 1 ^ crypto_int16_optblocker; /* mask is 1 as long as the volatile optblocker reads as 0 */
+  return crypto_int16_x;
+#endif
+}
 
-  x -= qpart*m; *q += qpart;
-  /* x <= m */
+__attribute__((unused)) /* Shifts x right by s and returns -1 when the resulting bit 0 is set, 0 otherwise. NOTE(review): the name suggests s must be a public position in 0..15 - confirm. */
+static inline
+crypto_int16 crypto_int16_bitinrangepublicpos_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_s) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("sarw %%cl,%0" : "+r"(crypto_int16_x) : "c"(crypto_int16_s) : "cc"); /* shift count is passed in %cl via the "c" constraint */
+#elif defined(__GNUC__) && defined(__aarch64__)
+  __asm__ ("sxth %w0,%w0\n asr %w0,%w0,%w1" : "+&r"(crypto_int16_x) : "r"(crypto_int16_s) : ); /* sign-extend to 32 bits, then arithmetic shift */
+#else
+  crypto_int16_x >>= crypto_int16_s ^ crypto_int16_optblocker; /* shift count is s as long as the volatile optblocker reads as 0 */
+#endif
+  return crypto_int16_bottombit_mask(crypto_int16_x);
+}
 
-  x -= m; *q += 1;
-  mask = -(x>>31);
-  x += mask&(uint32)m; *q += mask;
-  /* x < m */
+__attribute__((unused)) /* Shifts x right by s and returns the resulting bit 0 as 0 or 1. NOTE(review): the name suggests s must be a public position in 0..15 - confirm. */
+static inline
+crypto_int16 crypto_int16_bitinrangepublicpos_01(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_s) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("sarw %%cl,%0" : "+r"(crypto_int16_x) : "c"(crypto_int16_s) : "cc"); /* shift count is passed in %cl via the "c" constraint */
+#elif defined(__GNUC__) && defined(__aarch64__)
+  __asm__ ("sxth %w0,%w0\n asr %w0,%w0,%w1" : "+&r"(crypto_int16_x) : "r"(crypto_int16_s) : ); /* sign-extend to 32 bits, then arithmetic shift */
+#else
+  crypto_int16_x >>= crypto_int16_s ^ crypto_int16_optblocker; /* shift count is s as long as the volatile optblocker reads as 0 */
+#endif
+  return crypto_int16_bottombit_01(crypto_int16_x);
+}
 
-  *r = x;
+__attribute__((unused)) /* Returns x shifted left by (s mod 16), truncated to 16 bits; s is only ever used as data, never in a branch. */
+static inline
+crypto_int16 crypto_int16_shlmod(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_s) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16_s &= 15; /* reduce the count mod 16 before it is handed to shlw in %cl */
+  __asm__ ("shlw %%cl,%0" : "+r"(crypto_int16_x) : "c"(crypto_int16_s) : "cc");
+#elif defined(__GNUC__) && defined(__aarch64__)
+  __asm__ ("and %w0,%w0,15\n and %w1,%w1,65535\n lsl %w1,%w1,%w0" : "+&r"(crypto_int16_s), "+r"(crypto_int16_x) : : );
+#else
+  int crypto_int16_k, crypto_int16_l; /* k runs over the shift amounts 1,2,4,8; l is the index of the bit of s that selects each one */
+  for (crypto_int16_l = 0,crypto_int16_k = 1;crypto_int16_k < 16;++crypto_int16_l,crypto_int16_k *= 2)
+    crypto_int16_x ^= (crypto_int16_x ^ ((crypto_int16_unsigned) crypto_int16_x << crypto_int16_k)) & crypto_int16_bitinrangepublicpos_mask(crypto_int16_s,crypto_int16_l); /* shift the unsigned bit pattern: << on a negative signed value is undefined behaviour; the low 16 bits are unchanged */
+#endif
+  return crypto_int16_x;
 }
 
+__attribute__((unused)) /* Returns x arithmetically shifted right by (s mod 16); s is only ever used as data, never in a branch. */
+static inline
+crypto_int16 crypto_int16_shrmod(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_s) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16_s &= 15; /* reduce the count mod 16 before it is handed to sarw in %cl */
+  __asm__ ("sarw %%cl,%0" : "+r"(crypto_int16_x) : "c"(crypto_int16_s) : "cc");
+#elif defined(__GNUC__) && defined(__aarch64__)
+  __asm__ ("and %w0,%w0,15\n sxth %w1,%w1\n asr %w1,%w1,%w0" : "+&r"(crypto_int16_s), "+r"(crypto_int16_x) : : );
+#else
+  int crypto_int16_k, crypto_int16_l; /* k runs over the shift amounts 1,2,4,8; l is the index of the bit of s that selects each one */
+  for (crypto_int16_l = 0,crypto_int16_k = 1;crypto_int16_k < 16;++crypto_int16_l,crypto_int16_k *= 2)
+    crypto_int16_x ^= (crypto_int16_x ^ (crypto_int16_x >> crypto_int16_k)) & crypto_int16_bitinrangepublicpos_mask(crypto_int16_s,crypto_int16_l); /* masked select: x becomes x >> k when bit l of s is set, else stays x */
+#endif
+  return crypto_int16_x;
+}
+
+__attribute__((unused)) /* Returns -1 when bit (s mod 16) of x is set and 0 otherwise. */
+static inline
+crypto_int16 crypto_int16_bitmod_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_s) {
+  crypto_int16_x = crypto_int16_shrmod(crypto_int16_x,crypto_int16_s); /* bring the requested bit down to position 0 */
+  return crypto_int16_bottombit_mask(crypto_int16_x);
+}
+
+__attribute__((unused)) /* Returns bit (s mod 16) of x as 0 or 1. */
+static inline
+crypto_int16 crypto_int16_bitmod_01(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_s) {
+  crypto_int16_x = crypto_int16_shrmod(crypto_int16_x,crypto_int16_s); /* bring the requested bit down to position 0 */
+  return crypto_int16_bottombit_01(crypto_int16_x);
+}
+
+__attribute__((unused)) /* Returns -1 when x is nonzero and 0 when x is zero. */
+static inline
+crypto_int16 crypto_int16_nonzero_mask(crypto_int16 crypto_int16_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_q,crypto_int16_z; /* z is the result (starts at 0); q holds the constant that cmov may copy in */
+  __asm__ ("xorw %0,%0\n movw $-1,%1\n testw %2,%2\n cmovnew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x) : "cc");
+  return crypto_int16_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_z;
+  __asm__ ("tst %w1,65535\n csetm %w0,ne" : "=r"(crypto_int16_z) : "r"(crypto_int16_x) : "cc"); /* tests only the low 16 bits of the register */
+  return crypto_int16_z;
+#else
+  crypto_int16_x |= -crypto_int16_x; /* for nonzero x, either x or -x has the sign bit set */
+  return crypto_int16_negative_mask(crypto_int16_x);
+#endif
+}
 
-static uint16 uint32_mod_uint14(uint32 x,uint16 m)
-{
-  uint32 q;
-  uint16 r;
-  uint32_divmod_uint14(&q,&r,x,m);
-  return r;
+__attribute__((unused)) /* Returns 1 when x is nonzero and 0 when x is zero. */
+static inline
+crypto_int16 crypto_int16_nonzero_01(crypto_int16 crypto_int16_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_q,crypto_int16_z; /* z is the result (starts at 0); q holds the constant that cmov may copy in */
+  __asm__ ("xorw %0,%0\n movw $1,%1\n testw %2,%2\n cmovnew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x) : "cc");
+  return crypto_int16_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_z;
+  __asm__ ("tst %w1,65535\n cset %w0,ne" : "=r"(crypto_int16_z) : "r"(crypto_int16_x) : "cc"); /* tests only the low 16 bits of the register */
+  return crypto_int16_z;
+#else
+  crypto_int16_x |= -crypto_int16_x; /* for nonzero x, either x or -x has the sign bit set */
+  return crypto_int16_unsigned_topbit_01(crypto_int16_x);
+#endif
 }
 
-/* from supercop-20201130/crypto_kem/sntrup761/ref/int32.c */
+__attribute__((unused)) /* Returns -1 when x is strictly greater than 0 and 0 otherwise. */
+static inline
+crypto_int16 crypto_int16_positive_mask(crypto_int16 crypto_int16_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_q,crypto_int16_z; /* z is the result (starts at 0); q holds the constant that cmov may copy in */
+  __asm__ ("xorw %0,%0\n movw $-1,%1\n testw %2,%2\n cmovgw %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x) : "cc");
+  return crypto_int16_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_z;
+  __asm__ ("sxth %w0,%w1\n cmp %w0,0\n csetm %w0,gt" : "=r"(crypto_int16_z) : "r"(crypto_int16_x) : "cc"); /* sign-extend first so the 32-bit compare sees the 16-bit sign */
+  return crypto_int16_z;
+#else
+  crypto_int16 crypto_int16_z = -crypto_int16_x;
+  crypto_int16_z ^= crypto_int16_x & crypto_int16_z; /* clears from -x every bit that is also set in x, so the sign bit survives only when x itself is not negative */
+  return crypto_int16_negative_mask(crypto_int16_z);
+#endif
+}
 
-static void int32_divmod_uint14(int32 *q,uint16 *r,int32 x,uint16 m)
-{
-  uint32 uq,uq2;
-  uint16 ur,ur2;
-  uint32 mask;
+__attribute__((unused)) /* Returns 1 when x is strictly greater than 0 and 0 otherwise. */
+static inline
+crypto_int16 crypto_int16_positive_01(crypto_int16 crypto_int16_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_q,crypto_int16_z; /* z is the result (starts at 0); q holds the constant that cmov may copy in */
+  __asm__ ("xorw %0,%0\n movw $1,%1\n testw %2,%2\n cmovgw %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x) : "cc");
+  return crypto_int16_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_z;
+  __asm__ ("sxth %w0,%w1\n cmp %w0,0\n cset %w0,gt" : "=r"(crypto_int16_z) : "r"(crypto_int16_x) : "cc"); /* sign-extend first so the 32-bit compare sees the 16-bit sign */
+  return crypto_int16_z;
+#else
+  crypto_int16 crypto_int16_z = -crypto_int16_x;
+  crypto_int16_z ^= crypto_int16_x & crypto_int16_z; /* clears from -x every bit that is also set in x, so the sign bit survives only when x itself is not negative */
+  return crypto_int16_unsigned_topbit_01(crypto_int16_z);
+#endif
+}
 
-  uint32_divmod_uint14(&uq,&ur,0x80000000+(uint32)x,m);
-  uint32_divmod_uint14(&uq2,&ur2,0x80000000,m);
-  ur -= ur2; uq -= uq2;
-  mask = -(uint32)(ur>>15);
-  ur += mask&m; uq += mask;
-  *r = ur; *q = uq;
+__attribute__((unused)) /* Returns -1 when x is zero and 0 otherwise. */
+static inline
+crypto_int16 crypto_int16_zero_mask(crypto_int16 crypto_int16_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_q,crypto_int16_z; /* z is the result (starts at 0); q holds the constant that cmov may copy in */
+  __asm__ ("xorw %0,%0\n movw $-1,%1\n testw %2,%2\n cmovew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x) : "cc");
+  return crypto_int16_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_z;
+  __asm__ ("tst %w1,65535\n csetm %w0,eq" : "=r"(crypto_int16_z) : "r"(crypto_int16_x) : "cc"); /* tests only the low 16 bits of the register */
+  return crypto_int16_z;
+#else
+  return ~crypto_int16_nonzero_mask(crypto_int16_x); /* bitwise complement of the nonzero mask */
+#endif
 }
 
+__attribute__((unused)) /* Returns 1 when x is zero and 0 otherwise. */
+static inline
+crypto_int16 crypto_int16_zero_01(crypto_int16 crypto_int16_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_q,crypto_int16_z; /* z is the result (starts at 0); q holds the constant that cmov may copy in */
+  __asm__ ("xorw %0,%0\n movw $1,%1\n testw %2,%2\n cmovew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x) : "cc");
+  return crypto_int16_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_z;
+  __asm__ ("tst %w1,65535\n cset %w0,eq" : "=r"(crypto_int16_z) : "r"(crypto_int16_x) : "cc"); /* tests only the low 16 bits of the register */
+  return crypto_int16_z;
+#else
+  return 1-crypto_int16_nonzero_01(crypto_int16_x); /* flips the 0/1 answer of the nonzero test */
+#endif
+}
 
-static uint16 int32_mod_uint14(int32 x,uint16 m)
-{
-  int32 q;
-  uint16 r;
-  int32_divmod_uint14(&q,&r,x,m);
-  return r;
+__attribute__((unused)) /* Returns -1 when x and y differ and 0 when they are equal. */
+static inline
+crypto_int16 crypto_int16_unequal_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_q,crypto_int16_z; /* z is the result (starts at 0); q holds the constant that cmov may copy in */
+  __asm__ ("xorw %0,%0\n movw $-1,%1\n cmpw %3,%2\n cmovnew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc");
+  return crypto_int16_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_z;
+  __asm__ ("and %w0,%w1,65535\n cmp %w0,%w2,uxth\n csetm %w0,ne" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); /* both operands are reduced to their low 16 bits before the compare */
+  return crypto_int16_z;
+#else
+  return crypto_int16_nonzero_mask(crypto_int16_x ^ crypto_int16_y); /* x ^ y is nonzero exactly when the inputs differ */
+#endif
 }
 
-/* from supercop-20201130/crypto_kem/sntrup761/ref/paramsmenu.h */
-/* pick one of these three: */
-#define SIZE761
-#undef SIZE653
-#undef SIZE857
+__attribute__((unused)) /* Returns 1 when x and y differ and 0 when they are equal. */
+static inline
+crypto_int16 crypto_int16_unequal_01(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_q,crypto_int16_z; /* z is the result (starts at 0); q holds the constant that cmov may copy in */
+  __asm__ ("xorw %0,%0\n movw $1,%1\n cmpw %3,%2\n cmovnew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc");
+  return crypto_int16_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_z;
+  __asm__ ("and %w0,%w1,65535\n cmp %w0,%w2,uxth\n cset %w0,ne" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); /* both operands are reduced to their low 16 bits before the compare */
+  return crypto_int16_z;
+#else
+  return crypto_int16_nonzero_01(crypto_int16_x ^ crypto_int16_y); /* x ^ y is nonzero exactly when the inputs differ */
+#endif
+}
 
-/* pick one of these two: */
-#define SNTRUP /* Streamlined NTRU Prime */
-#undef LPR /* NTRU LPRime */
+__attribute__((unused)) /* Returns -1 when x equals y and 0 otherwise. */
+static inline
+crypto_int16 crypto_int16_equal_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_q,crypto_int16_z; /* z is the result (starts at 0); q holds the constant that cmov may copy in */
+  __asm__ ("xorw %0,%0\n movw $-1,%1\n cmpw %3,%2\n cmovew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc");
+  return crypto_int16_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_z;
+  __asm__ ("and %w0,%w1,65535\n cmp %w0,%w2,uxth\n csetm %w0,eq" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); /* both operands are reduced to their low 16 bits before the compare */
+  return crypto_int16_z;
+#else
+  return ~crypto_int16_unequal_mask(crypto_int16_x,crypto_int16_y); /* bitwise complement of the unequal mask */
+#endif
+}
 
-/* from supercop-20201130/crypto_kem/sntrup761/ref/params.h */
-#ifndef params_H
-#define params_H
+__attribute__((unused)) /* Returns 1 when x equals y and 0 otherwise. */
+static inline
+crypto_int16 crypto_int16_equal_01(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_q,crypto_int16_z; /* z is the result (starts at 0); q holds the constant that cmov may copy in */
+  __asm__ ("xorw %0,%0\n movw $1,%1\n cmpw %3,%2\n cmovew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc");
+  return crypto_int16_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_z;
+  __asm__ ("and %w0,%w1,65535\n cmp %w0,%w2,uxth\n cset %w0,eq" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); /* both operands are reduced to their low 16 bits before the compare */
+  return crypto_int16_z;
+#else
+  return 1-crypto_int16_unequal_01(crypto_int16_x,crypto_int16_y); /* flips the 0/1 answer of the unequal test */
+#endif
+}
 
-/* menu of parameter choices: */
+__attribute__((unused)) /* Returns the smaller of x and y under signed comparison, selecting with conditional moves or masks rather than a branch. */
+static inline
+crypto_int16 crypto_int16_min(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("cmpw %1,%0\n cmovgw %1,%0" : "+r"(crypto_int16_x) : "r"(crypto_int16_y) : "cc"); /* replace x by y when x > y */
+  return crypto_int16_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  __asm__ ("sxth %w0,%w0\n cmp %w0,%w1,sxth\n csel %w0,%w0,%w1,lt" : "+&r"(crypto_int16_x) : "r"(crypto_int16_y) : "cc"); /* keep x when x < y, else take y */
+  return crypto_int16_x;
+#else
+  crypto_int16 crypto_int16_r = crypto_int16_y ^ crypto_int16_x; /* bits in which the inputs differ */
+  crypto_int16 crypto_int16_z = crypto_int16_y - crypto_int16_x;
+  crypto_int16_z ^= crypto_int16_r & (crypto_int16_z ^ crypto_int16_y); /* where the inputs differ take y's bits instead of the difference's, so the sign bit stays right when y - x wraps */
+  crypto_int16_z = crypto_int16_negative_mask(crypto_int16_z); /* -1 when y < x, else 0 */
+  crypto_int16_z &= crypto_int16_r;
+  return crypto_int16_x ^ crypto_int16_z; /* x ^ (x ^ y) = y when y < x, otherwise x */
+#endif
+}
 
+__attribute__((unused)) /* Returns the larger of x and y under signed comparison, selecting with conditional moves or masks rather than a branch. */
+static inline
+crypto_int16 crypto_int16_max(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("cmpw %1,%0\n cmovlw %1,%0" : "+r"(crypto_int16_x) : "r"(crypto_int16_y) : "cc"); /* replace x by y when x < y */
+  return crypto_int16_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  __asm__ ("sxth %w0,%w0\n cmp %w0,%w1,sxth\n csel %w0,%w1,%w0,lt" : "+&r"(crypto_int16_x) : "r"(crypto_int16_y) : "cc"); /* take y when x < y, else keep x */
+  return crypto_int16_x;
+#else
+  crypto_int16 crypto_int16_r = crypto_int16_y ^ crypto_int16_x; /* bits in which the inputs differ */
+  crypto_int16 crypto_int16_z = crypto_int16_y - crypto_int16_x;
+  crypto_int16_z ^= crypto_int16_r & (crypto_int16_z ^ crypto_int16_y); /* where the inputs differ take y's bits instead of the difference's, so the sign bit stays right when y - x wraps */
+  crypto_int16_z = crypto_int16_negative_mask(crypto_int16_z); /* -1 when y < x, else 0 */
+  crypto_int16_z &= crypto_int16_r;
+  return crypto_int16_y ^ crypto_int16_z; /* y ^ (x ^ y) = x when y < x, otherwise y */
+#endif
+}
 
-/* what the menu means: */
+__attribute__((unused)) /* Orders a pair in place: on return *p holds the smaller and *q the larger of the two input values (signed comparison). */
+static inline
+void crypto_int16_minmax(crypto_int16 *crypto_int16_p,crypto_int16 *crypto_int16_q) {
+  crypto_int16 crypto_int16_x = *crypto_int16_p;
+  crypto_int16 crypto_int16_y = *crypto_int16_q;
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_z; /* scratch copy of the original x for the second conditional move */
+  __asm__ ("cmpw %2,%1\n movw %1,%0\n cmovgw %2,%1\n cmovgw %0,%2" : "=&r"(crypto_int16_z), "+&r"(crypto_int16_x), "+r"(crypto_int16_y) : : "cc"); /* when x > y the two registers exchange values */
+  *crypto_int16_p = crypto_int16_x;
+  *crypto_int16_q = crypto_int16_y;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_r, crypto_int16_s; /* r receives the minimum, s the maximum */
+  __asm__ ("sxth %w0,%w0\n cmp %w0,%w3,sxth\n csel %w1,%w0,%w3,lt\n csel %w2,%w3,%w0,lt" : "+&r"(crypto_int16_x), "=&r"(crypto_int16_r), "=r"(crypto_int16_s) : "r"(crypto_int16_y) : "cc");
+  *crypto_int16_p = crypto_int16_r;
+  *crypto_int16_q = crypto_int16_s;
+#else
+  crypto_int16 crypto_int16_r = crypto_int16_y ^ crypto_int16_x; /* bits in which the inputs differ */
+  crypto_int16 crypto_int16_z = crypto_int16_y - crypto_int16_x;
+  crypto_int16_z ^= crypto_int16_r & (crypto_int16_z ^ crypto_int16_y); /* where the inputs differ take y's bits instead of the difference's, so the sign bit stays right when y - x wraps */
+  crypto_int16_z = crypto_int16_negative_mask(crypto_int16_z); /* -1 when y < x, else 0 */
+  crypto_int16_z &= crypto_int16_r; /* x ^ y when a swap is needed, else 0 */
+  crypto_int16_x ^= crypto_int16_z; /* XOR-swap both values under the mask */
+  crypto_int16_y ^= crypto_int16_z;
+  *crypto_int16_p = crypto_int16_x;
+  *crypto_int16_q = crypto_int16_y;
+#endif
+}
 
-#if defined(SIZE761)
-#define p 761
-#define q 4591
-#define Rounded_bytes 1007
-#ifndef LPR
-#define Rq_bytes 1158
-#define w 286
+__attribute__((unused)) /* Returns -1 when x < y under signed comparison and 0 otherwise. */
+static inline
+crypto_int16 crypto_int16_smaller_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_q,crypto_int16_z; /* z is the result (starts at 0); q holds the constant that cmov may copy in */
+  __asm__ ("xorw %0,%0\n movw $-1,%1\n cmpw %3,%2\n cmovlw %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc");
+  return crypto_int16_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_z;
+  __asm__ ("sxth %w0,%w1\n cmp %w0,%w2,sxth\n csetm %w0,lt" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); /* both operands are sign-extended before the compare */
+  return crypto_int16_z;
 #else
-#define w 250
-#define tau0 2156
-#define tau1 114
-#define tau2 2007
-#define tau3 287
+  crypto_int16 crypto_int16_r = crypto_int16_x ^ crypto_int16_y; /* bits in which the inputs differ */
+  crypto_int16 crypto_int16_z = crypto_int16_x - crypto_int16_y;
+  crypto_int16_z ^= crypto_int16_r & (crypto_int16_z ^ crypto_int16_x); /* where the inputs differ take x's bits instead of the difference's, so the sign bit stays right when x - y wraps */
+  return crypto_int16_negative_mask(crypto_int16_z);
 #endif
+}
 
-#elif defined(SIZE653)
-#define p 653
-#define q 4621
-#define Rounded_bytes 865
-#ifndef LPR
-#define Rq_bytes 994
-#define w 288
+__attribute__((unused)) /* Returns 1 when x < y under signed comparison and 0 otherwise. */
+static inline
+crypto_int16 crypto_int16_smaller_01(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_q,crypto_int16_z; /* z is the result (starts at 0); q holds the constant that cmov may copy in */
+  __asm__ ("xorw %0,%0\n movw $1,%1\n cmpw %3,%2\n cmovlw %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc");
+  return crypto_int16_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_z;
+  __asm__ ("sxth %w0,%w1\n cmp %w0,%w2,sxth\n cset %w0,lt" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); /* both operands are sign-extended before the compare */
+  return crypto_int16_z;
 #else
-#define w 252
-#define tau0 2175
-#define tau1 113
-#define tau2 2031
-#define tau3 290
+  crypto_int16 crypto_int16_r = crypto_int16_x ^ crypto_int16_y; /* bits in which the inputs differ */
+  crypto_int16 crypto_int16_z = crypto_int16_x - crypto_int16_y;
+  crypto_int16_z ^= crypto_int16_r & (crypto_int16_z ^ crypto_int16_x); /* where the inputs differ take x's bits instead of the difference's, so the sign bit stays right when x - y wraps */
+  return crypto_int16_unsigned_topbit_01(crypto_int16_z);
 #endif
+}
 
-#elif defined(SIZE857)
-#define p 857
-#define q 5167
-#define Rounded_bytes 1152
-#ifndef LPR
-#define Rq_bytes 1322
-#define w 322
+__attribute__((unused)) /* Returns -1 when x <= y under signed comparison and 0 otherwise. */
+static inline
+crypto_int16 crypto_int16_leq_mask(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_q,crypto_int16_z; /* z is the result (starts at 0); q holds the constant that cmov may copy in */
+  __asm__ ("xorw %0,%0\n movw $-1,%1\n cmpw %3,%2\n cmovlew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc");
+  return crypto_int16_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_z;
+  __asm__ ("sxth %w0,%w1\n cmp %w0,%w2,sxth\n csetm %w0,le" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); /* both operands are sign-extended before the compare */
+  return crypto_int16_z;
 #else
-#define w 281
-#define tau0 2433
-#define tau1 101
-#define tau2 2265
-#define tau3 324
+  return ~crypto_int16_smaller_mask(crypto_int16_y,crypto_int16_x); /* x <= y is the negation of y < x; note the swapped arguments */
 #endif
+}
 
+__attribute__((unused)) /* Returns 1 when x <= y under signed comparison and 0 otherwise. */
+static inline
+crypto_int16 crypto_int16_leq_01(crypto_int16 crypto_int16_x,crypto_int16 crypto_int16_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 crypto_int16_q,crypto_int16_z; /* z is the result (starts at 0); q holds the constant that cmov may copy in */
+  __asm__ ("xorw %0,%0\n movw $1,%1\n cmpw %3,%2\n cmovlew %1,%0" : "=&r"(crypto_int16_z), "=&r"(crypto_int16_q) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc");
+  return crypto_int16_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int16 crypto_int16_z;
+  __asm__ ("sxth %w0,%w1\n cmp %w0,%w2,sxth\n cset %w0,le" : "=&r"(crypto_int16_z) : "r"(crypto_int16_x), "r"(crypto_int16_y) : "cc"); /* both operands are sign-extended before the compare */
+  return crypto_int16_z;
 #else
-#error "no parameter set defined"
+  return 1-crypto_int16_smaller_01(crypto_int16_y,crypto_int16_x); /* x <= y is the negation of y < x; note the swapped arguments */
 #endif
+}
 
-#ifdef LPR
-#define I 256
+__attribute__((unused)) /* Returns the number of set bits in x (0..16), computed with masks and adds only. */
+static inline
+int crypto_int16_ones_num(crypto_int16 crypto_int16_x) {
+  crypto_int16_unsigned crypto_int16_y = crypto_int16_x; /* work on the unsigned bit pattern so the shifts below are logical */
+  const crypto_int16 C0 = 0x5555; /* every other bit */
+  const crypto_int16 C1 = 0x3333; /* low two bits of each nibble */
+  const crypto_int16 C2 = 0x0f0f; /* low nibble of each byte */
+  crypto_int16_y -= ((crypto_int16_y >> 1) & C0); /* each 2-bit field now holds the count of its own two bits */
+  crypto_int16_y = (crypto_int16_y & C1) + ((crypto_int16_y >> 2) & C1); /* sum adjacent pairs into 4-bit fields */
+  crypto_int16_y = (crypto_int16_y + (crypto_int16_y >> 4)) & C2; /* sum adjacent nibbles into bytes */
+  crypto_int16_y = (crypto_int16_y + (crypto_int16_y >> 8)) & 0xff; /* add the two byte counts */
+  return crypto_int16_y;
+}
+
+__attribute__((unused)) /* Returns the number of consecutive zero bits at the low end of x; the x86 path substitutes 16 when x is zero. */
+static inline
+int crypto_int16_bottomzeros_num(crypto_int16 crypto_int16_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int16 fallback = 16; /* value selected by cmove when bsfw reports a zero source */
+  __asm__ ("bsfw %0,%0\n cmovew %1,%0" : "+&r"(crypto_int16_x) : "r"(fallback) : "cc");
+  return crypto_int16_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  int64_t crypto_int16_y;
+  __asm__ ("orr %w0,%w1,-65536\n rbit %w0,%w0\n clz %w0,%w0" : "=r"(crypto_int16_y) : "r"(crypto_int16_x) : ); /* set bits 16..31, reverse the bits, then count leading zeros */
+  return crypto_int16_y;
+#else
+  crypto_int16 crypto_int16_y = crypto_int16_x ^ (crypto_int16_x-1); /* ones from bit 0 up to and including the lowest set bit of x */
+  crypto_int16_y = ((crypto_int16) crypto_int16_y) >> 1; /* drop the lowest set bit itself, leaving one 1 per trailing zero */
+  crypto_int16_y &= ~(crypto_int16_x & (((crypto_int16) 1) << (16-1))); /* clear bit 15 when x has it set; NOTE(review): looks like it undoes the sign bit that the arithmetic shift re-inserts - confirm */
+  return crypto_int16_ones_num(crypto_int16_y);
 #endif
+}
 
 #endif
 
-/* from supercop-20201130/crypto_kem/sntrup761/ref/Decode.h */
-#ifndef Decode_H
-#define Decode_H
+/* from supercop-20240808/cryptoint/crypto_int32.h */
+/* auto-generated: cd cryptoint; ./autogen */
+/* cryptoint 20240806 */
 
+#ifndef crypto_int32_h
+#define crypto_int32_h
 
-/* Decode(R,s,M,len) */
-/* assumes 0 < M[i] < 16384 */
-/* produces 0 <= R[i] < M[i] */
+#define crypto_int32 int32_t
+#define crypto_int32_unsigned uint32_t
 
-#endif
 
-/* from supercop-20201130/crypto_kem/sntrup761/ref/Decode.c */
 
-static void Decode(uint16 *out,const unsigned char *S,const uint16 *M,long long len)
-{
-  if (len == 1) {
-    if (M[0] == 1)
-      *out = 0;
-    else if (M[0] <= 256)
-      *out = uint32_mod_uint14(S[0],M[0]);
-    else
-      *out = uint32_mod_uint14(S[0]+(((uint16)S[1])<<8),M[0]);
-  }
-  if (len > 1) {
-    uint16 R2[(len+1)/2];
-    uint16 M2[(len+1)/2];
-    uint16 bottomr[len/2];
-    uint32 bottomt[len/2];
-    long long i;
-    for (i = 0;i < len-1;i += 2) {
-      uint32 m = M[i]*(uint32) M[i+1];
-      if (m > 256*16383) {
-        bottomt[i/2] = 256*256;
-        bottomr[i/2] = S[0]+256*S[1];
-        S += 2;
-        M2[i/2] = (((m+255)>>8)+255)>>8;
-      } else if (m >= 16384) {
-        bottomt[i/2] = 256;
-        bottomr[i/2] = S[0];
-        S += 1;
-        M2[i/2] = (m+255)>>8;
-      } else {
-        bottomt[i/2] = 1;
-        bottomr[i/2] = 0;
-        M2[i/2] = m;
-      }
-    }
-    if (i < len)
-      M2[i/2] = M[i];
-    Decode(R2,S,M2,(len+1)/2);
-    for (i = 0;i < len-1;i += 2) {
-      uint32 r = bottomr[i/2];
-      uint32 r1;
-      uint16 r0;
-      r += bottomt[i/2]*R2[i/2];
-      uint32_divmod_uint14(&r1,&r0,r,M[i]);
-      r1 = uint32_mod_uint14(r1,M[i+1]); /* only needed for invalid inputs */
-      *out++ = r0;
-      *out++ = r1;
-    }
-    if (i < len)
-      *out++ = R2[i/2];
-  }
+__attribute__((unused)) /* Reads four bytes at crypto_int32_s and returns them as one little-endian 32-bit value. */
+static inline
+crypto_int32 crypto_int32_load(const unsigned char *crypto_int32_s) {
+  crypto_int32_unsigned crypto_int32_z = 0; /* assemble in the unsigned type: shifting a byte >= 0x80 left by 24 in a signed 32-bit type is undefined behaviour */
+  crypto_int32_z |= ((crypto_int32_unsigned) (*crypto_int32_s++)) << 0; /* first byte supplies bits 0..7 */
+  crypto_int32_z |= ((crypto_int32_unsigned) (*crypto_int32_s++)) << 8;
+  crypto_int32_z |= ((crypto_int32_unsigned) (*crypto_int32_s++)) << 16;
+  crypto_int32_z |= ((crypto_int32_unsigned) (*crypto_int32_s++)) << 24; /* fourth byte supplies bits 24..31, including the sign bit */
+  return (crypto_int32) crypto_int32_z; /* same bit pattern reinterpreted as signed */
 }
 
-/* from supercop-20201130/crypto_kem/sntrup761/ref/Encode.h */
-#ifndef Encode_H
-#define Encode_H
-
-
-/* Encode(s,R,M,len) */
-/* assumes 0 <= R[i] < M[i] < 16384 */
+__attribute__((unused)) /* Writes crypto_int32_x to the four bytes at crypto_int32_s, lowest byte first. */
+static inline
+void crypto_int32_store(unsigned char *crypto_int32_s,crypto_int32 crypto_int32_x) {
+  *crypto_int32_s++ = crypto_int32_x >> 0; /* assignment to unsigned char keeps the low 8 bits */
+  *crypto_int32_s++ = crypto_int32_x >> 8;
+  *crypto_int32_s++ = crypto_int32_x >> 16;
+  *crypto_int32_s++ = crypto_int32_x >> 24; /* bits 24..31 */
+}
 
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_negative_mask(crypto_int32 crypto_int32_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("sarl $31,%0" : "+r"(crypto_int32_x) : : "cc");
+  return crypto_int32_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_y;
+  __asm__ ("asr %w0,%w1,31" : "=r"(crypto_int32_y) : "r"(crypto_int32_x) : );
+  return crypto_int32_y;
+#else
+  crypto_int32_x >>= 32-6;
+  crypto_int32_x ^= crypto_int32_optblocker;
+  crypto_int32_x >>= 5;
+  return crypto_int32_x;
 #endif
-
-/* from supercop-20201130/crypto_kem/sntrup761/ref/Encode.c */
-
-/* 0 <= R[i] < M[i] < 16384 */
-static void Encode(unsigned char *out,const uint16 *R,const uint16 *M,long long len)
-{
-  if (len == 1) {
-    uint16 r = R[0];
-    uint16 m = M[0];
-    while (m > 1) {
-      *out++ = r;
-      r >>= 8;
-      m = (m+255)>>8;
-    }
-  }
-  if (len > 1) {
-    uint16 R2[(len+1)/2];
-    uint16 M2[(len+1)/2];
-    long long i;
-    for (i = 0;i < len-1;i += 2) {
-      uint32 m0 = M[i];
-      uint32 r = R[i]+R[i+1]*m0;
-      uint32 m = M[i+1]*m0;
-      while (m >= 16384) {
-        *out++ = r;
-        r >>= 8;
-        m = (m+255)>>8;
-      }
-      R2[i/2] = r;
-      M2[i/2] = m;
-    }
-    if (i < len) {
-      R2[i/2] = R[i];
-      M2[i/2] = M[i];
-    }
-    Encode(out,R2,M2,(len+1)/2);
-  }
 }
 
-/* from supercop-20201130/crypto_kem/sntrup761/ref/kem.c */
-
-#ifdef LPR
+__attribute__((unused))
+static inline
+crypto_int32_unsigned crypto_int32_unsigned_topbit_01(crypto_int32_unsigned crypto_int32_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("shrl $31,%0" : "+r"(crypto_int32_x) : : "cc");
+  return crypto_int32_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_y;
+  __asm__ ("lsr %w0,%w1,31" : "=r"(crypto_int32_y) : "r"(crypto_int32_x) : );
+  return crypto_int32_y;
+#else
+  crypto_int32_x >>= 32-6;
+  crypto_int32_x ^= crypto_int32_optblocker;
+  crypto_int32_x >>= 5;
+  return crypto_int32_x;
 #endif
+}
 
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_negative_01(crypto_int32 crypto_int32_x) {
+  return crypto_int32_unsigned_topbit_01(crypto_int32_x);
+}
 
-/* ----- masks */
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_topbit_mask(crypto_int32 crypto_int32_x) {
+  return crypto_int32_negative_mask(crypto_int32_x);
+}
 
-#ifndef LPR
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_topbit_01(crypto_int32 crypto_int32_x) {
+  return crypto_int32_unsigned_topbit_01(crypto_int32_x);
+}
 
-/* return -1 if x!=0; else return 0 */
-static int int16_nonzero_mask(int16 x)
-{
-  uint16 u = x; /* 0, else 1...65535 */
-  uint32 v = u; /* 0, else 1...65535 */
-  v = -v; /* 0, else 2^32-65535...2^32-1 */
-  v >>= 31; /* 0, else 1 */
-  return -v; /* 0, else -1 */
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_bottombit_mask(crypto_int32 crypto_int32_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("andl $1,%0" : "+r"(crypto_int32_x) : : "cc");
+  return -crypto_int32_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_y;
+  __asm__ ("sbfx %w0,%w1,0,1" : "=r"(crypto_int32_y) : "r"(crypto_int32_x) : );
+  return crypto_int32_y;
+#else
+  crypto_int32_x &= 1 ^ crypto_int32_optblocker;
+  return -crypto_int32_x;
+#endif
 }
 
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_bottombit_01(crypto_int32 crypto_int32_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("andl $1,%0" : "+r"(crypto_int32_x) : : "cc");
+  return crypto_int32_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_y;
+  __asm__ ("ubfx %w0,%w1,0,1" : "=r"(crypto_int32_y) : "r"(crypto_int32_x) : );
+  return crypto_int32_y;
+#else
+  crypto_int32_x &= 1 ^ crypto_int32_optblocker;
+  return crypto_int32_x;
 #endif
+}
 
-/* return -1 if x<0; otherwise return 0 */
-static int int16_negative_mask(int16 x)
-{
-  uint16 u = x;
-  u >>= 15;
-  return -(int) u;
-  /* alternative with gcc -fwrapv: */
-  /* x>>15 compiles to CPU's arithmetic right shift */
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_bitinrangepublicpos_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_s) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("sarl %%cl,%0" : "+r"(crypto_int32_x) : "c"(crypto_int32_s) : "cc");
+#elif defined(__GNUC__) && defined(__aarch64__)
+  __asm__ ("asr %w0,%w0,%w1" : "+r"(crypto_int32_x) : "r"(crypto_int32_s) : );
+#else
+  crypto_int32_x >>= crypto_int32_s ^ crypto_int32_optblocker;
+#endif
+  return crypto_int32_bottombit_mask(crypto_int32_x);
 }
 
-/* ----- arithmetic mod 3 */
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_bitinrangepublicpos_01(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_s) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("sarl %%cl,%0" : "+r"(crypto_int32_x) : "c"(crypto_int32_s) : "cc");
+#elif defined(__GNUC__) && defined(__aarch64__)
+  __asm__ ("asr %w0,%w0,%w1" : "+r"(crypto_int32_x) : "r"(crypto_int32_s) : );
+#else
+  crypto_int32_x >>= crypto_int32_s ^ crypto_int32_optblocker;
+#endif
+  return crypto_int32_bottombit_01(crypto_int32_x);
+}
 
-typedef int8 small;
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_shlmod(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_s) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("shll %%cl,%0" : "+r"(crypto_int32_x) : "c"(crypto_int32_s) : "cc");
+#elif defined(__GNUC__) && defined(__aarch64__)
+  __asm__ ("lsl %w0,%w0,%w1" : "+r"(crypto_int32_x) : "r"(crypto_int32_s) : );
+#else
+  int crypto_int32_k, crypto_int32_l;
+  for (crypto_int32_l = 0,crypto_int32_k = 1;crypto_int32_k < 32;++crypto_int32_l,crypto_int32_k *= 2)
+    crypto_int32_x ^= (crypto_int32_x ^ (crypto_int32_x << crypto_int32_k)) & crypto_int32_bitinrangepublicpos_mask(crypto_int32_s,crypto_int32_l);
+#endif
+  return crypto_int32_x;
+}
 
-/* F3 is always represented as -1,0,1 */
-/* so ZZ_fromF3 is a no-op */
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_shrmod(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_s) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("sarl %%cl,%0" : "+r"(crypto_int32_x) : "c"(crypto_int32_s) : "cc");
+#elif defined(__GNUC__) && defined(__aarch64__)
+  __asm__ ("asr %w0,%w0,%w1" : "+r"(crypto_int32_x) : "r"(crypto_int32_s) : );
+#else
+  int crypto_int32_k, crypto_int32_l;
+  for (crypto_int32_l = 0,crypto_int32_k = 1;crypto_int32_k < 32;++crypto_int32_l,crypto_int32_k *= 2)
+    crypto_int32_x ^= (crypto_int32_x ^ (crypto_int32_x >> crypto_int32_k)) & crypto_int32_bitinrangepublicpos_mask(crypto_int32_s,crypto_int32_l);
+#endif
+  return crypto_int32_x;
+}
+
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_bitmod_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_s) {
+  crypto_int32_x = crypto_int32_shrmod(crypto_int32_x,crypto_int32_s);
+  return crypto_int32_bottombit_mask(crypto_int32_x);
+}
+
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_bitmod_01(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_s) {
+  crypto_int32_x = crypto_int32_shrmod(crypto_int32_x,crypto_int32_s);
+  return crypto_int32_bottombit_01(crypto_int32_x);
+}
+
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_nonzero_mask(crypto_int32 crypto_int32_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_q,crypto_int32_z;
+  __asm__ ("xorl %0,%0\n movl $-1,%1\n testl %2,%2\n cmovnel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x) : "cc");
+  return crypto_int32_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmp %w1,0\n csetm %w0,ne" : "=r"(crypto_int32_z) : "r"(crypto_int32_x) : "cc");
+  return crypto_int32_z;
+#else
+  crypto_int32_x |= -crypto_int32_x;
+  return crypto_int32_negative_mask(crypto_int32_x);
+#endif
+}
 
-/* x must not be close to top int16 */
-static small F3_freeze(int16 x)
-{
-  return int32_mod_uint14(x+1,3)-1;
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_nonzero_01(crypto_int32 crypto_int32_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_q,crypto_int32_z;
+  __asm__ ("xorl %0,%0\n movl $1,%1\n testl %2,%2\n cmovnel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x) : "cc");
+  return crypto_int32_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmp %w1,0\n cset %w0,ne" : "=r"(crypto_int32_z) : "r"(crypto_int32_x) : "cc");
+  return crypto_int32_z;
+#else
+  crypto_int32_x |= -crypto_int32_x;
+  return crypto_int32_unsigned_topbit_01(crypto_int32_x);
+#endif
 }
 
-/* ----- arithmetic mod q */
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_positive_mask(crypto_int32 crypto_int32_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_q,crypto_int32_z;
+  __asm__ ("xorl %0,%0\n movl $-1,%1\n testl %2,%2\n cmovgl %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x) : "cc");
+  return crypto_int32_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmp %w1,0\n csetm %w0,gt" : "=r"(crypto_int32_z) : "r"(crypto_int32_x) : "cc");
+  return crypto_int32_z;
+#else
+  crypto_int32 crypto_int32_z = -crypto_int32_x;
+  crypto_int32_z ^= crypto_int32_x & crypto_int32_z;
+  return crypto_int32_negative_mask(crypto_int32_z);
+#endif
+}
 
-#define q12 ((q-1)/2)
-typedef int16 Fq;
-/* always represented as -q12...q12 */
-/* so ZZ_fromFq is a no-op */
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_positive_01(crypto_int32 crypto_int32_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_q,crypto_int32_z;
+  __asm__ ("xorl %0,%0\n movl $1,%1\n testl %2,%2\n cmovgl %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x) : "cc");
+  return crypto_int32_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmp %w1,0\n cset %w0,gt" : "=r"(crypto_int32_z) : "r"(crypto_int32_x) : "cc");
+  return crypto_int32_z;
+#else
+  crypto_int32 crypto_int32_z = -crypto_int32_x;
+  crypto_int32_z ^= crypto_int32_x & crypto_int32_z;
+  return crypto_int32_unsigned_topbit_01(crypto_int32_z);
+#endif
+}
 
-/* x must not be close to top int32 */
-static Fq Fq_freeze(int32 x)
-{
-  return int32_mod_uint14(x+q12,q)-q12;
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_zero_mask(crypto_int32 crypto_int32_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_q,crypto_int32_z;
+  __asm__ ("xorl %0,%0\n movl $-1,%1\n testl %2,%2\n cmovel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x) : "cc");
+  return crypto_int32_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmp %w1,0\n csetm %w0,eq" : "=r"(crypto_int32_z) : "r"(crypto_int32_x) : "cc");
+  return crypto_int32_z;
+#else
+  return ~crypto_int32_nonzero_mask(crypto_int32_x);
+#endif
 }
 
-#ifndef LPR
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_zero_01(crypto_int32 crypto_int32_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_q,crypto_int32_z;
+  __asm__ ("xorl %0,%0\n movl $1,%1\n testl %2,%2\n cmovel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x) : "cc");
+  return crypto_int32_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmp %w1,0\n cset %w0,eq" : "=r"(crypto_int32_z) : "r"(crypto_int32_x) : "cc");
+  return crypto_int32_z;
+#else
+  return 1-crypto_int32_nonzero_01(crypto_int32_x);
+#endif
+}
 
-static Fq Fq_recip(Fq a1)
-{
-  int i = 1;
-  Fq ai = a1;
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_unequal_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_q,crypto_int32_z;
+  __asm__ ("xorl %0,%0\n movl $-1,%1\n cmpl %3,%2\n cmovnel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmp %w1,%w2\n csetm %w0,ne" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#else
+  return crypto_int32_nonzero_mask(crypto_int32_x ^ crypto_int32_y);
+#endif
+}
 
-  while (i < q-2) {
-    ai = Fq_freeze(a1*(int32)ai);
-    i += 1;
-  }
-  return ai;
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_unequal_01(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_q,crypto_int32_z;
+  __asm__ ("xorl %0,%0\n movl $1,%1\n cmpl %3,%2\n cmovnel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmp %w1,%w2\n cset %w0,ne" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#else
+  return crypto_int32_nonzero_01(crypto_int32_x ^ crypto_int32_y);
+#endif
 }
 
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_equal_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_q,crypto_int32_z;
+  __asm__ ("xorl %0,%0\n movl $-1,%1\n cmpl %3,%2\n cmovel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmp %w1,%w2\n csetm %w0,eq" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#else
+  return ~crypto_int32_unequal_mask(crypto_int32_x,crypto_int32_y);
 #endif
+}
 
-/* ----- Top and Right */
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_equal_01(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_q,crypto_int32_z;
+  __asm__ ("xorl %0,%0\n movl $1,%1\n cmpl %3,%2\n cmovel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmp %w1,%w2\n cset %w0,eq" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#else
+  return 1-crypto_int32_unequal_01(crypto_int32_x,crypto_int32_y);
+#endif
+}
 
-#ifdef LPR
-#define tau 16
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_min(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("cmpl %1,%0\n cmovgl %1,%0" : "+r"(crypto_int32_x) : "r"(crypto_int32_y) : "cc");
+  return crypto_int32_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  __asm__ ("cmp %w0,%w1\n csel %w0,%w0,%w1,lt" : "+r"(crypto_int32_x) : "r"(crypto_int32_y) : "cc");
+  return crypto_int32_x;
+#else
+  crypto_int32 crypto_int32_r = crypto_int32_y ^ crypto_int32_x;
+  crypto_int32 crypto_int32_z = crypto_int32_y - crypto_int32_x;
+  crypto_int32_z ^= crypto_int32_r & (crypto_int32_z ^ crypto_int32_y);
+  crypto_int32_z = crypto_int32_negative_mask(crypto_int32_z);
+  crypto_int32_z &= crypto_int32_r;
+  return crypto_int32_x ^ crypto_int32_z;
+#endif
+}
 
-static int8 Top(Fq C)
-{
-  return (tau1*(int32)(C+tau0)+16384)>>15;
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_max(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("cmpl %1,%0\n cmovll %1,%0" : "+r"(crypto_int32_x) : "r"(crypto_int32_y) : "cc");
+  return crypto_int32_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  __asm__ ("cmp %w0,%w1\n csel %w0,%w1,%w0,lt" : "+r"(crypto_int32_x) : "r"(crypto_int32_y) : "cc");
+  return crypto_int32_x;
+#else
+  crypto_int32 crypto_int32_r = crypto_int32_y ^ crypto_int32_x;
+  crypto_int32 crypto_int32_z = crypto_int32_y - crypto_int32_x;
+  crypto_int32_z ^= crypto_int32_r & (crypto_int32_z ^ crypto_int32_y);
+  crypto_int32_z = crypto_int32_negative_mask(crypto_int32_z);
+  crypto_int32_z &= crypto_int32_r;
+  return crypto_int32_y ^ crypto_int32_z;
+#endif
 }
 
-static Fq Right(int8 T)
-{
-  return Fq_freeze(tau3*(int32)T-tau2);
+__attribute__((unused))
+static inline
+void crypto_int32_minmax(crypto_int32 *crypto_int32_p,crypto_int32 *crypto_int32_q) {
+  crypto_int32 crypto_int32_x = *crypto_int32_p;
+  crypto_int32 crypto_int32_y = *crypto_int32_q;
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmpl %2,%1\n movl %1,%0\n cmovgl %2,%1\n cmovgl %0,%2" : "=&r"(crypto_int32_z), "+&r"(crypto_int32_x), "+r"(crypto_int32_y) : : "cc");
+  *crypto_int32_p = crypto_int32_x;
+  *crypto_int32_q = crypto_int32_y;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_r, crypto_int32_s;
+  __asm__ ("cmp %w2,%w3\n csel %w0,%w2,%w3,lt\n csel %w1,%w3,%w2,lt" : "=&r"(crypto_int32_r), "=r"(crypto_int32_s) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  *crypto_int32_p = crypto_int32_r;
+  *crypto_int32_q = crypto_int32_s;
+#else
+  crypto_int32 crypto_int32_r = crypto_int32_y ^ crypto_int32_x;
+  crypto_int32 crypto_int32_z = crypto_int32_y - crypto_int32_x;
+  crypto_int32_z ^= crypto_int32_r & (crypto_int32_z ^ crypto_int32_y);
+  crypto_int32_z = crypto_int32_negative_mask(crypto_int32_z);
+  crypto_int32_z &= crypto_int32_r;
+  crypto_int32_x ^= crypto_int32_z;
+  crypto_int32_y ^= crypto_int32_z;
+  *crypto_int32_p = crypto_int32_x;
+  *crypto_int32_q = crypto_int32_y;
+#endif
 }
+
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_smaller_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_q,crypto_int32_z;
+  __asm__ ("xorl %0,%0\n movl $-1,%1\n cmpl %3,%2\n cmovll %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmp %w1,%w2\n csetm %w0,lt" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#else
+  crypto_int32 crypto_int32_r = crypto_int32_x ^ crypto_int32_y;
+  crypto_int32 crypto_int32_z = crypto_int32_x - crypto_int32_y;
+  crypto_int32_z ^= crypto_int32_r & (crypto_int32_z ^ crypto_int32_x);
+  return crypto_int32_negative_mask(crypto_int32_z);
 #endif
+}
 
-/* ----- small polynomials */
-
-#ifndef LPR
-
-/* 0 if Weightw_is(r), else -1 */
-static int Weightw_mask(small *r)
-{
-  int weight = 0;
-  int i;
-
-  for (i = 0;i < p;++i) weight += r[i]&1;
-  return int16_nonzero_mask(weight-w);
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_smaller_01(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_q,crypto_int32_z;
+  __asm__ ("xorl %0,%0\n movl $1,%1\n cmpl %3,%2\n cmovll %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmp %w1,%w2\n cset %w0,lt" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#else
+  crypto_int32 crypto_int32_r = crypto_int32_x ^ crypto_int32_y;
+  crypto_int32 crypto_int32_z = crypto_int32_x - crypto_int32_y;
+  crypto_int32_z ^= crypto_int32_r & (crypto_int32_z ^ crypto_int32_x);
+  return crypto_int32_unsigned_topbit_01(crypto_int32_z);
+#endif
 }
 
-/* R3_fromR(R_fromRq(r)) */
-static void R3_fromRq(small *out,const Fq *r)
-{
-  int i;
-  for (i = 0;i < p;++i) out[i] = F3_freeze(r[i]);
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_leq_mask(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_q,crypto_int32_z;
+  __asm__ ("xorl %0,%0\n movl $-1,%1\n cmpl %3,%2\n cmovlel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmp %w1,%w2\n csetm %w0,le" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#else
+  return ~crypto_int32_smaller_mask(crypto_int32_y,crypto_int32_x);
+#endif
 }
 
-/* h = f*g in the ring R3 */
-static void R3_mult(small *h,const small *f,const small *g)
-{
-  small fg[p+p-1];
-  small result;
-  int i,j;
-
-  for (i = 0;i < p;++i) {
-    result = 0;
-    for (j = 0;j <= i;++j) result = F3_freeze(result+f[j]*g[i-j]);
-    fg[i] = result;
-  }
-  for (i = p;i < p+p-1;++i) {
-    result = 0;
-    for (j = i-p+1;j < p;++j) result = F3_freeze(result+f[j]*g[i-j]);
-    fg[i] = result;
-  }
-
-  for (i = p+p-2;i >= p;--i) {
-    fg[i-p] = F3_freeze(fg[i-p]+fg[i]);
-    fg[i-p+1] = F3_freeze(fg[i-p+1]+fg[i]);
-  }
-
-  for (i = 0;i < p;++i) h[i] = fg[i];
+__attribute__((unused))
+static inline
+crypto_int32 crypto_int32_leq_01(crypto_int32 crypto_int32_x,crypto_int32 crypto_int32_y) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 crypto_int32_q,crypto_int32_z;
+  __asm__ ("xorl %0,%0\n movl $1,%1\n cmpl %3,%2\n cmovlel %1,%0" : "=&r"(crypto_int32_z), "=&r"(crypto_int32_q) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int32 crypto_int32_z;
+  __asm__ ("cmp %w1,%w2\n cset %w0,le" : "=r"(crypto_int32_z) : "r"(crypto_int32_x), "r"(crypto_int32_y) : "cc");
+  return crypto_int32_z;
+#else
+  return 1-crypto_int32_smaller_01(crypto_int32_y,crypto_int32_x);
+#endif
 }
 
-/* returns 0 if recip succeeded; else -1 */
-static int R3_recip(small *out,const small *in)
-{
-  small f[p+1],g[p+1],v[p+1],r[p+1];
-  int i,loop,delta;
-  int sign,swap,t;
-
-  for (i = 0;i < p+1;++i) v[i] = 0;
-  for (i = 0;i < p+1;++i) r[i] = 0;
-  r[0] = 1;
-  for (i = 0;i < p;++i) f[i] = 0;
-  f[0] = 1; f[p-1] = f[p] = -1;
-  for (i = 0;i < p;++i) g[p-1-i] = in[i];
-  g[p] = 0;
-
-  delta = 1;
-
-  for (loop = 0;loop < 2*p-1;++loop) {
-    for (i = p;i > 0;--i) v[i] = v[i-1];
-    v[0] = 0;
-
-    sign = -g[0]*f[0];
-    swap = int16_negative_mask(-delta) & int16_nonzero_mask(g[0]);
-    delta ^= swap&(delta^-delta);
-    delta += 1;
-
-    for (i = 0;i < p+1;++i) {
-      t = swap&(f[i]^g[i]); f[i] ^= t; g[i] ^= t;
-      t = swap&(v[i]^r[i]); v[i] ^= t; r[i] ^= t;
-    }
-
-    for (i = 0;i < p+1;++i) g[i] = F3_freeze(g[i]+sign*f[i]);
-    for (i = 0;i < p+1;++i) r[i] = F3_freeze(r[i]+sign*v[i]);
-
-    for (i = 0;i < p;++i) g[i] = g[i+1];
-    g[p] = 0;
-  }
-
-  sign = f[0];
-  for (i = 0;i < p;++i) out[i] = sign*v[p-1-i];
-
-  return int16_nonzero_mask(delta);
+__attribute__((unused))
+static inline
+int crypto_int32_ones_num(crypto_int32 crypto_int32_x) {
+  crypto_int32_unsigned crypto_int32_y = crypto_int32_x;
+  const crypto_int32 C0 = 0x55555555;
+  const crypto_int32 C1 = 0x33333333;
+  const crypto_int32 C2 = 0x0f0f0f0f;
+  crypto_int32_y -= ((crypto_int32_y >> 1) & C0);
+  crypto_int32_y = (crypto_int32_y & C1) + ((crypto_int32_y >> 2) & C1);
+  crypto_int32_y = (crypto_int32_y + (crypto_int32_y >> 4)) & C2;
+  crypto_int32_y += crypto_int32_y >> 8;
+  crypto_int32_y = (crypto_int32_y + (crypto_int32_y >> 16)) & 0xff;
+  return crypto_int32_y;
+}
+
+__attribute__((unused))
+static inline
+int crypto_int32_bottomzeros_num(crypto_int32 crypto_int32_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  crypto_int32 fallback = 32;
+  __asm__ ("bsfl %0,%0\n cmovel %1,%0" : "+&r"(crypto_int32_x) : "r"(fallback) : "cc");
+  return crypto_int32_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  int64_t crypto_int32_y;
+  __asm__ ("rbit %w0,%w1\n clz %w0,%w0" : "=r"(crypto_int32_y) : "r"(crypto_int32_x) : );
+  return crypto_int32_y;
+#else
+  crypto_int32 crypto_int32_y = crypto_int32_x ^ (crypto_int32_x-1);
+  crypto_int32_y = ((crypto_int32) crypto_int32_y) >> 1;
+  crypto_int32_y &= ~(crypto_int32_x & (((crypto_int32) 1) << (32-1)));
+  return crypto_int32_ones_num(crypto_int32_y);
+#endif
 }
 
 #endif
 
-/* ----- polynomials mod q */
-
-/* h = f*g in the ring Rq */
-static void Rq_mult_small(Fq *h,const Fq *f,const small *g)
-{
-  Fq fg[p+p-1];
-  Fq result;
-  int i,j;
-
-  for (i = 0;i < p;++i) {
-    result = 0;
-    for (j = 0;j <= i;++j) result = Fq_freeze(result+f[j]*(int32)g[i-j]);
-    fg[i] = result;
-  }
-  for (i = p;i < p+p-1;++i) {
-    result = 0;
-    for (j = i-p+1;j < p;++j) result = Fq_freeze(result+f[j]*(int32)g[i-j]);
-    fg[i] = result;
-  }
-
-  for (i = p+p-2;i >= p;--i) {
-    fg[i-p] = Fq_freeze(fg[i-p]+fg[i]);
-    fg[i-p+1] = Fq_freeze(fg[i-p+1]+fg[i]);
-  }
-
-  for (i = 0;i < p;++i) h[i] = fg[i];
+/* from supercop-20240808/cryptoint/crypto_int64.h */
+/* auto-generated: cd cryptoint; ./autogen */
+/* cryptoint 20240806 */
+
+#ifndef crypto_int64_h
+#define crypto_int64_h
+
+#define crypto_int64 int64_t
+#define crypto_int64_unsigned uint64_t
+
+
+
+__attribute__((unused))
+static inline
+crypto_int64 crypto_int64_load(const unsigned char *crypto_int64_s) {
+  crypto_int64 crypto_int64_z = 0;
+  crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 0;
+  crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 8;
+  crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 16;
+  crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 24;
+  crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 32;
+  crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 40;
+  crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 48;
+  crypto_int64_z |= ((crypto_int64) (*crypto_int64_s++)) << 56;
+  return crypto_int64_z;
+}
+
+__attribute__((unused))
+static inline
+void crypto_int64_store(unsigned char *crypto_int64_s,crypto_int64 crypto_int64_x) {
+  *crypto_int64_s++ = crypto_int64_x >> 0;
+  *crypto_int64_s++ = crypto_int64_x >> 8;
+  *crypto_int64_s++ = crypto_int64_x >> 16;
+  *crypto_int64_s++ = crypto_int64_x >> 24;
+  *crypto_int64_s++ = crypto_int64_x >> 32;
+  *crypto_int64_s++ = crypto_int64_x >> 40;
+  *crypto_int64_s++ = crypto_int64_x >> 48;
+  *crypto_int64_s++ = crypto_int64_x >> 56;
+}
+
+__attribute__((unused))
+static inline
+crypto_int64 crypto_int64_negative_mask(crypto_int64 crypto_int64_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("sarq $63,%0" : "+r"(crypto_int64_x) : : "cc");
+  return crypto_int64_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int64 crypto_int64_y;
+  __asm__ ("asr %0,%1,63" : "=r"(crypto_int64_y) : "r"(crypto_int64_x) : );
+  return crypto_int64_y;
+#else
+  crypto_int64_x >>= 64-6;
+  crypto_int64_x ^= crypto_int64_optblocker;
+  crypto_int64_x >>= 5;
+  return crypto_int64_x;
+#endif
 }
 
-#ifndef LPR
-
-/* h = 3f in Rq */
-static void Rq_mult3(Fq *h,const Fq *f)
-{
-  int i;
-
-  for (i = 0;i < p;++i) h[i] = Fq_freeze(3*f[i]);
+__attribute__((unused))
+static inline
+crypto_int64_unsigned crypto_int64_unsigned_topbit_01(crypto_int64_unsigned crypto_int64_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("shrq $63,%0" : "+r"(crypto_int64_x) : : "cc");
+  return crypto_int64_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int64 crypto_int64_y;
+  __asm__ ("lsr %0,%1,63" : "=r"(crypto_int64_y) : "r"(crypto_int64_x) : );
+  return crypto_int64_y;
+#else
+  crypto_int64_x >>= 64-6;
+  crypto_int64_x ^= crypto_int64_optblocker;
+  crypto_int64_x >>= 5;
+  return crypto_int64_x;
+#endif
 }
 
-/* out = 1/(3*in) in Rq */
-/* returns 0 if recip succeeded; else -1 */
-static int Rq_recip3(Fq *out,const small *in)
-{
-  Fq f[p+1],g[p+1],v[p+1],r[p+1];
-  int i,loop,delta;
-  int swap,t;
-  int32 f0,g0;
-  Fq scale;
-
-  for (i = 0;i < p+1;++i) v[i] = 0;
-  for (i = 0;i < p+1;++i) r[i] = 0;
-  r[0] = Fq_recip(3);
-  for (i = 0;i < p;++i) f[i] = 0;
-  f[0] = 1; f[p-1] = f[p] = -1;
-  for (i = 0;i < p;++i) g[p-1-i] = in[i];
-  g[p] = 0;
-
-  delta = 1;
-
-  for (loop = 0;loop < 2*p-1;++loop) {
-    for (i = p;i > 0;--i) v[i] = v[i-1];
-    v[0] = 0;
-
-    swap = int16_negative_mask(-delta) & int16_nonzero_mask(g[0]);
-    delta ^= swap&(delta^-delta);
-    delta += 1;
-
-    for (i = 0;i < p+1;++i) {
-      t = swap&(f[i]^g[i]); f[i] ^= t; g[i] ^= t;
-      t = swap&(v[i]^r[i]); v[i] ^= t; r[i] ^= t;
-    }
-
-    f0 = f[0];
-    g0 = g[0];
-    for (i = 0;i < p+1;++i) g[i] = Fq_freeze(f0*g[i]-g0*f[i]);
-    for (i = 0;i < p+1;++i) r[i] = Fq_freeze(f0*r[i]-g0*v[i]);
-
-    for (i = 0;i < p;++i) g[i] = g[i+1];
-    g[p] = 0;
-  }
+__attribute__((unused))
+static inline
+crypto_int64 crypto_int64_negative_01(crypto_int64 crypto_int64_x) {
+  return crypto_int64_unsigned_topbit_01(crypto_int64_x);
+}
 
-  scale = Fq_recip(f[0]);
-  for (i = 0;i < p;++i) out[i] = Fq_freeze(scale*(int32)v[p-1-i]);
+__attribute__((unused))
+static inline
+crypto_int64 crypto_int64_topbit_mask(crypto_int64 crypto_int64_x) {
+  return crypto_int64_negative_mask(crypto_int64_x);
+}
 
-  return int16_nonzero_mask(delta);
+__attribute__((unused))
+static inline
+crypto_int64 crypto_int64_topbit_01(crypto_int64 crypto_int64_x) {
+  return crypto_int64_unsigned_topbit_01(crypto_int64_x);
 }
 
+__attribute__((unused))
+static inline
+crypto_int64 crypto_int64_bottombit_mask(crypto_int64 crypto_int64_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("andq $1,%0" : "+r"(crypto_int64_x) : : "cc");
+  return -crypto_int64_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int64 crypto_int64_y;
+  __asm__ ("sbfx %0,%1,0,1" : "=r"(crypto_int64_y) : "r"(crypto_int64_x) : );
+  return crypto_int64_y;
+#else
+  crypto_int64_x &= 1 ^ crypto_int64_optblocker;
+  return -crypto_int64_x;
 #endif
-
-/* ----- rounded polynomials mod q */
-
-static void Round(Fq *out,const Fq *a)
-{
-  int i;
-  for (i = 0;i < p;++i) out[i] = a[i]-F3_freeze(a[i]);
 }
 
-/* ----- sorting to generate short polynomial */
-
-static void Short_fromlist(small *out,const uint32 *in)
-{
-  uint32 L[p];
-  int i;
-
-  for (i = 0;i < w;++i) L[i] = in[i]&(uint32)-2;
-  for (i = w;i < p;++i) L[i] = (in[i]&(uint32)-3)|1;
-  crypto_sort_uint32(L,p);
-  for (i = 0;i < p;++i) out[i] = (L[i]&3)-1;
+__attribute__((unused))
+static inline
+crypto_int64 crypto_int64_bottombit_01(crypto_int64 crypto_int64_x) {
+#if defined(__GNUC__) && defined(__x86_64__)
+  __asm__ ("andq $1,%0" : "+r"(crypto_int64_x) : : "cc");
+  return crypto_int64_x;
+#elif defined(__GNUC__) && defined(__aarch64__)
+  crypto_int64 crypto_int64_y;
+  __asm__ ("ubfx %0,%1,0,1" : "=r"(crypto_int64_y) : "r"(crypto_int64_x) : );
+  return crypto_int64_y;
+#else
+  crypto_int64_x &= 1 ^ crypto_int64_optblocker;
+  return crypto_int64_x;
+#endif
 }
 
-/* ----- underlying hash function */
-
-#define Hash_bytes 32
-
-/* e.g., b = 0 means out = Hash0(in) */
-static void Hash_prefix(unsigned char *out,int b,const unsigned char *in,int inlen)
-{
-  unsigned char x[inlen+1];
-  unsigned char h[64];
-  int i;
-
-  x[0] = b;
-  for (i = 0;i < inlen;++i) x[i+1] = in[i];
-  crypto_hash_sha512(h,x,inlen+1);
-  for (i = 0;i < 32;++i) out[i] = h[i];
+__attribute__((unused)) /* shifts x right by s, then returns bit 0 of the result as a 0/-1 mask */
+static inline
+crypto_int64 crypto_int64_bitinrangepublicpos_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_s) {
+#if defined(__GNUC__) && defined(__x86_64__) /* NOTE(review): name suggests s is public and within 0..63 - confirm */
+  __asm__ ("sarq %%cl,%0" : "+r"(crypto_int64_x) : "c"(crypto_int64_s) : "cc");
+#elif defined(__GNUC__) && defined(__aarch64__) /* arithmetic shift right by register */
+  __asm__ ("asr %0,%0,%1" : "+r"(crypto_int64_x) : "r"(crypto_int64_s) : );
+#else /* portable C path; no range check on s is done here */
+  crypto_int64_x >>= crypto_int64_s ^ crypto_int64_optblocker;
+#endif
+  return crypto_int64_bottombit_mask(crypto_int64_x); /* -(x & 1) of the shifted value */
 }
 
-/* ----- higher-level randomness */
-
-static uint32 urandom32(void)
-{
-  unsigned char c[4];
-  uint32 out[4];
-
-  randombytes(c,4);
-  out[0] = (uint32)c[0];
-  out[1] = ((uint32)c[1])<<8;
-  out[2] = ((uint32)c[2])<<16;
-  out[3] = ((uint32)c[3])<<24;
-  return out[0]+out[1]+out[2]+out[3];
+__attribute__((unused)) /* shifts x right by s, then returns bit 0 of the result as 0 or 1 */
+static inline
+crypto_int64 crypto_int64_bitinrangepublicpos_01(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_s) {
+#if defined(__GNUC__) && defined(__x86_64__) /* NOTE(review): name suggests s is public and within 0..63 - confirm */
+  __asm__ ("sarq %%cl,%0" : "+r"(crypto_int64_x) : "c"(crypto_int64_s) : "cc");
+#elif defined(__GNUC__) && defined(__aarch64__) /* arithmetic shift right by register */
+  __asm__ ("asr %0,%0,%1" : "+r"(crypto_int64_x) : "r"(crypto_int64_s) : );
+#else /* portable C path; no range check on s is done here */
+  crypto_int64_x >>= crypto_int64_s ^ crypto_int64_optblocker;
+#endif
+  return crypto_int64_bottombit_01(crypto_int64_x); /* x & 1 of the shifted value */
 }
 
-static void Short_random(small *out)
-{
-  uint32 L[p];
-  int i;
-
-  for (i = 0;i < p;++i) L[i] = urandom32();
-  Short_fromlist(out,L);
+__attribute__((unused)) /* left shift of x; the portable path uses only bits 0..5 of s as the count */
+static inline
+crypto_int64 crypto_int64_shlmod(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_s) {
+#if defined(__GNUC__) && defined(__x86_64__) /* single shl with the count in %cl */
+  __asm__ ("shlq %%cl,%0" : "+r"(crypto_int64_x) : "c"(crypto_int64_s) : "cc");
+#elif defined(__GNUC__) && defined(__aarch64__) /* single lsl by register */
+  __asm__ ("lsl %0,%0,%1" : "+r"(crypto_int64_x) : "r"(crypto_int64_s) : );
+#else /* portable path: six mask-selected shifts by k = 1,2,4,8,16,32 */
+  int crypto_int64_k, crypto_int64_l;
+  for (crypto_int64_l = 0,crypto_int64_k = 1;crypto_int64_k < 64;++crypto_int64_l,crypto_int64_k *= 2)
+    crypto_int64_x ^= (crypto_int64_x ^ (crypto_int64_x << crypto_int64_k)) & crypto_int64_bitinrangepublicpos_mask(crypto_int64_s,crypto_int64_l); /* x becomes x<<k when bit l of s is set */
+#endif
+  return crypto_int64_x;
 }
 
-#ifndef LPR
-
-static void Small_random(small *out)
-{
-  int i;
-
-  for (i = 0;i < p;++i) out[i] = (((urandom32()&0x3fffffff)*3)>>30)-1;
+__attribute__((unused)) /* right shift of signed x; the portable path uses only bits 0..5 of s as the count */
+static inline
+crypto_int64 crypto_int64_shrmod(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_s) {
+#if defined(__GNUC__) && defined(__x86_64__) /* single arithmetic shift with the count in %cl */
+  __asm__ ("sarq %%cl,%0" : "+r"(crypto_int64_x) : "c"(crypto_int64_s) : "cc");
+#elif defined(__GNUC__) && defined(__aarch64__) /* single arithmetic shift by register */
+  __asm__ ("asr %0,%0,%1" : "+r"(crypto_int64_x) : "r"(crypto_int64_s) : );
+#else /* portable path: six mask-selected shifts by k = 1,2,4,8,16,32 */
+  int crypto_int64_k, crypto_int64_l;
+  for (crypto_int64_l = 0,crypto_int64_k = 1;crypto_int64_k < 64;++crypto_int64_l,crypto_int64_k *= 2)
+    crypto_int64_x ^= (crypto_int64_x ^ (crypto_int64_x >> crypto_int64_k)) & crypto_int64_bitinrangepublicpos_mask(crypto_int64_s,crypto_int64_l); /* x becomes x>>k when bit l of s is set */
+#endif
+  return crypto_int64_x;
+}
+
+__attribute__((unused)) /* 0/-1 mask of bit 0 of crypto_int64_shrmod(x,s) */
+static inline
+crypto_int64 crypto_int64_bitmod_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_s) {
+  crypto_int64_x = crypto_int64_shrmod(crypto_int64_x,crypto_int64_s); /* move the selected bit down to position 0 */
+  return crypto_int64_bottombit_mask(crypto_int64_x); /* -(x & 1) */
+}
+
+__attribute__((unused)) /* 0/1 value of bit 0 of crypto_int64_shrmod(x,s) */
+static inline
+crypto_int64 crypto_int64_bitmod_01(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_s) {
+  crypto_int64_x = crypto_int64_shrmod(crypto_int64_x,crypto_int64_s); /* move the selected bit down to position 0 */
+  return crypto_int64_bottombit_01(crypto_int64_x); /* x & 1 */
+}
+
+__attribute__((unused)) /* asm paths return -1 when x != 0 and 0 when x == 0, without branching */
+static inline
+crypto_int64 crypto_int64_nonzero_mask(crypto_int64 crypto_int64_x) {
+#if defined(__GNUC__) && defined(__x86_64__) /* z = 0, q = -1, then cmovne copies q into z */
+  crypto_int64 crypto_int64_q,crypto_int64_z;
+  __asm__ ("xorq %0,%0\n movq $-1,%1\n testq %2,%2\n cmovneq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x) : "cc");
+  return crypto_int64_z;
+#elif defined(__GNUC__) && defined(__aarch64__) /* compare against 0, csetm on ne */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmp %1,0\n csetm %0,ne" : "=r"(crypto_int64_z) : "r"(crypto_int64_x) : "cc");
+  return crypto_int64_z;
+#else /* portable C path */
+  crypto_int64_x |= -crypto_int64_x; /* sign bit of x | -x is set exactly when x != 0 */
+  return crypto_int64_negative_mask(crypto_int64_x); /* defined outside this chunk; presumably -1 for negative input */
+#endif
 }
 
+__attribute__((unused)) /* asm paths return 1 when x != 0 and 0 when x == 0, without branching */
+static inline
+crypto_int64 crypto_int64_nonzero_01(crypto_int64 crypto_int64_x) {
+#if defined(__GNUC__) && defined(__x86_64__) /* z = 0, q = 1, then cmovne copies q into z */
+  crypto_int64 crypto_int64_q,crypto_int64_z;
+  __asm__ ("xorq %0,%0\n movq $1,%1\n testq %2,%2\n cmovneq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x) : "cc");
+  return crypto_int64_z;
+#elif defined(__GNUC__) && defined(__aarch64__) /* compare against 0, cset on ne */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmp %1,0\n cset %0,ne" : "=r"(crypto_int64_z) : "r"(crypto_int64_x) : "cc");
+  return crypto_int64_z;
+#else /* portable C path */
+  crypto_int64_x |= -crypto_int64_x; /* sign bit of x | -x is set exactly when x != 0 */
+  return crypto_int64_unsigned_topbit_01(crypto_int64_x); /* defined outside this chunk; presumably the top bit as 0/1 */
 #endif
-
-/* ----- Streamlined NTRU Prime Core */
-
-#ifndef LPR
-
-/* h,(f,ginv) = KeyGen() */
-static void KeyGen(Fq *h,small *f,small *ginv)
-{
-  small g[p];
-  Fq finv[p];
-
-  for (;;) {
-    Small_random(g);
-    if (R3_recip(ginv,g) == 0) break;
-  }
-  Short_random(f);
-  Rq_recip3(finv,f); /* always works */
-  Rq_mult_small(h,finv,g);
 }
 
-/* c = Encrypt(r,h) */
-static void Encrypt(Fq *c,const small *r,const Fq *h)
-{
-  Fq hr[p];
-
-  Rq_mult_small(hr,h,r);
-  Round(c,hr);
+__attribute__((unused)) /* asm paths return -1 when x > 0 (signed) and 0 otherwise, without branching */
+static inline
+crypto_int64 crypto_int64_positive_mask(crypto_int64 crypto_int64_x) {
+#if defined(__GNUC__) && defined(__x86_64__) /* z = 0, q = -1, then cmovg copies q into z */
+  crypto_int64 crypto_int64_q,crypto_int64_z;
+  __asm__ ("xorq %0,%0\n movq $-1,%1\n testq %2,%2\n cmovgq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x) : "cc");
+  return crypto_int64_z;
+#elif defined(__GNUC__) && defined(__aarch64__) /* compare against 0, csetm on gt */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmp %1,0\n csetm %0,gt" : "=r"(crypto_int64_z) : "r"(crypto_int64_x) : "cc");
+  return crypto_int64_z;
+#else /* portable C path */
+  crypto_int64 crypto_int64_z = -crypto_int64_x;
+  crypto_int64_z ^= crypto_int64_x & crypto_int64_z; /* clears z's sign bit when x is negative too, leaving it set only for x > 0 */
+  return crypto_int64_negative_mask(crypto_int64_z); /* defined outside this chunk; presumably -1 for negative input */
+#endif
 }
 
-/* r = Decrypt(c,(f,ginv)) */
-static void Decrypt(small *r,const Fq *c,const small *f,const small *ginv)
-{
-  Fq cf[p];
-  Fq cf3[p];
-  small e[p];
-  small ev[p];
-  int mask;
-  int i;
-
-  Rq_mult_small(cf,c,f);
-  Rq_mult3(cf3,cf);
-  R3_fromRq(e,cf3);
-  R3_mult(ev,e,ginv);
-
-  mask = Weightw_mask(ev); /* 0 if weight w, else -1 */
-  for (i = 0;i < w;++i) r[i] = ((ev[i]^1)&~mask)^1;
-  for (i = w;i < p;++i) r[i] = ev[i]&~mask;
+__attribute__((unused)) /* asm paths return 1 when x > 0 (signed) and 0 otherwise, without branching */
+static inline
+crypto_int64 crypto_int64_positive_01(crypto_int64 crypto_int64_x) {
+#if defined(__GNUC__) && defined(__x86_64__) /* z = 0, q = 1, then cmovg copies q into z */
+  crypto_int64 crypto_int64_q,crypto_int64_z;
+  __asm__ ("xorq %0,%0\n movq $1,%1\n testq %2,%2\n cmovgq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x) : "cc");
+  return crypto_int64_z;
+#elif defined(__GNUC__) && defined(__aarch64__) /* compare against 0, cset on gt */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmp %1,0\n cset %0,gt" : "=r"(crypto_int64_z) : "r"(crypto_int64_x) : "cc");
+  return crypto_int64_z;
+#else /* portable C path */
+  crypto_int64 crypto_int64_z = -crypto_int64_x;
+  crypto_int64_z ^= crypto_int64_x & crypto_int64_z; /* clears z's sign bit when x is negative too, leaving it set only for x > 0 */
+  return crypto_int64_unsigned_topbit_01(crypto_int64_z); /* defined outside this chunk; presumably the top bit as 0/1 */
+#endif
 }
 
+__attribute__((unused)) /* returns -1 when x == 0 and 0 otherwise, without branching */
+static inline
+crypto_int64 crypto_int64_zero_mask(crypto_int64 crypto_int64_x) {
+#if defined(__GNUC__) && defined(__x86_64__) /* z = 0, q = -1, then cmove copies q into z */
+  crypto_int64 crypto_int64_q,crypto_int64_z;
+  __asm__ ("xorq %0,%0\n movq $-1,%1\n testq %2,%2\n cmoveq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x) : "cc");
+  return crypto_int64_z;
+#elif defined(__GNUC__) && defined(__aarch64__) /* compare against 0, csetm on eq */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmp %1,0\n csetm %0,eq" : "=r"(crypto_int64_z) : "r"(crypto_int64_x) : "cc");
+  return crypto_int64_z;
+#else /* portable C path: bitwise complement of the nonzero mask */
+  return ~crypto_int64_nonzero_mask(crypto_int64_x);
 #endif
-
-/* ----- NTRU LPRime Core */
-
-#ifdef LPR
-
-/* (G,A),a = KeyGen(G); leaves G unchanged */
-static void KeyGen(Fq *A,small *a,const Fq *G)
-{
-  Fq aG[p];
-
-  Short_random(a);
-  Rq_mult_small(aG,G,a);
-  Round(A,aG);
 }
 
-/* B,T = Encrypt(r,(G,A),b) */
-static void Encrypt(Fq *B,int8 *T,const int8 *r,const Fq *G,const Fq *A,const small *b)
-{
-  Fq bG[p];
-  Fq bA[p];
-  int i;
-
-  Rq_mult_small(bG,G,b);
-  Round(B,bG);
-  Rq_mult_small(bA,A,b);
-  for (i = 0;i < I;++i) T[i] = Top(Fq_freeze(bA[i]+r[i]*q12));
+__attribute__((unused)) /* returns 1 when x == 0 and 0 otherwise, without branching */
+static inline
+crypto_int64 crypto_int64_zero_01(crypto_int64 crypto_int64_x) {
+#if defined(__GNUC__) && defined(__x86_64__) /* z = 0, q = 1, then cmove copies q into z */
+  crypto_int64 crypto_int64_q,crypto_int64_z;
+  __asm__ ("xorq %0,%0\n movq $1,%1\n testq %2,%2\n cmoveq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x) : "cc");
+  return crypto_int64_z;
+#elif defined(__GNUC__) && defined(__aarch64__) /* compare against 0, cset on eq */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmp %1,0\n cset %0,eq" : "=r"(crypto_int64_z) : "r"(crypto_int64_x) : "cc");
+  return crypto_int64_z;
+#else /* portable C path: 1 minus the nonzero flag */
+  return 1-crypto_int64_nonzero_01(crypto_int64_x);
+#endif
 }
 
-/* r = Decrypt((B,T),a) */
-static void Decrypt(int8 *r,const Fq *B,const int8 *T,const small *a)
-{
-  Fq aB[p];
-  int i;
-
-  Rq_mult_small(aB,B,a);
-  for (i = 0;i < I;++i)
-    r[i] = -int16_negative_mask(Fq_freeze(Right(T[i])-aB[i]+4*w+1));
+__attribute__((unused)) /* returns -1 when x != y and 0 when x == y, without branching */
+static inline
+crypto_int64 crypto_int64_unequal_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) {
+#if defined(__GNUC__) && defined(__x86_64__) /* z = 0, q = -1, compare x with y, cmovne copies q into z */
+  crypto_int64 crypto_int64_q,crypto_int64_z;
+  __asm__ ("xorq %0,%0\n movq $-1,%1\n cmpq %3,%2\n cmovneq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#elif defined(__GNUC__) && defined(__aarch64__) /* compare x with y, csetm on ne */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmp %1,%2\n csetm %0,ne" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#else /* portable C path: x ^ y is nonzero exactly when x != y */
+  return crypto_int64_nonzero_mask(crypto_int64_x ^ crypto_int64_y);
+#endif
 }
 
+__attribute__((unused)) /* returns 1 when x != y and 0 when x == y, without branching */
+static inline
+crypto_int64 crypto_int64_unequal_01(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) {
+#if defined(__GNUC__) && defined(__x86_64__) /* z = 0, q = 1, compare x with y, cmovne copies q into z */
+  crypto_int64 crypto_int64_q,crypto_int64_z;
+  __asm__ ("xorq %0,%0\n movq $1,%1\n cmpq %3,%2\n cmovneq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#elif defined(__GNUC__) && defined(__aarch64__) /* compare x with y, cset on ne */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmp %1,%2\n cset %0,ne" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#else /* portable C path: x ^ y is nonzero exactly when x != y */
+  return crypto_int64_nonzero_01(crypto_int64_x ^ crypto_int64_y);
 #endif
-
-/* ----- encoding I-bit inputs */
-
-#ifdef LPR
-
-#define Inputs_bytes (I/8)
-typedef int8 Inputs[I]; /* passed by reference */
-
-static void Inputs_encode(unsigned char *s,const Inputs r)
-{
-  int i;
-  for (i = 0;i < Inputs_bytes;++i) s[i] = 0;
-  for (i = 0;i < I;++i) s[i>>3] |= r[i]<<(i&7);
 }
 
+__attribute__((unused)) /* returns -1 when x == y and 0 when x != y, without branching */
+static inline
+crypto_int64 crypto_int64_equal_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) {
+#if defined(__GNUC__) && defined(__x86_64__) /* z = 0, q = -1, compare x with y, cmove copies q into z */
+  crypto_int64 crypto_int64_q,crypto_int64_z;
+  __asm__ ("xorq %0,%0\n movq $-1,%1\n cmpq %3,%2\n cmoveq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#elif defined(__GNUC__) && defined(__aarch64__) /* compare x with y, csetm on eq */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmp %1,%2\n csetm %0,eq" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#else /* portable C path: bitwise complement of the unequal mask */
+  return ~crypto_int64_unequal_mask(crypto_int64_x,crypto_int64_y);
 #endif
-
-/* ----- Expand */
-
-#ifdef LPR
-
-static const unsigned char aes_nonce[16] = {0};
-
-static void Expand(uint32 *L,const unsigned char *k)
-{
-  int i;
-  crypto_stream_aes256ctr((unsigned char *) L,4*p,aes_nonce,k);
-  for (i = 0;i < p;++i) {
-    uint32 L0 = ((unsigned char *) L)[4*i];
-    uint32 L1 = ((unsigned char *) L)[4*i+1];
-    uint32 L2 = ((unsigned char *) L)[4*i+2];
-    uint32 L3 = ((unsigned char *) L)[4*i+3];
-    L[i] = L0+(L1<<8)+(L2<<16)+(L3<<24);
-  }
 }
 
+__attribute__((unused)) /* returns 1 when x == y and 0 when x != y, without branching */
+static inline
+crypto_int64 crypto_int64_equal_01(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) {
+#if defined(__GNUC__) && defined(__x86_64__) /* z = 0, q = 1, compare x with y, cmove copies q into z */
+  crypto_int64 crypto_int64_q,crypto_int64_z;
+  __asm__ ("xorq %0,%0\n movq $1,%1\n cmpq %3,%2\n cmoveq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#elif defined(__GNUC__) && defined(__aarch64__) /* compare x with y, cset on eq */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmp %1,%2\n cset %0,eq" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#else /* portable C path: 1 minus the unequal flag */
+  return 1-crypto_int64_unequal_01(crypto_int64_x,crypto_int64_y);
 #endif
-
-/* ----- Seeds */
-
-#ifdef LPR
-
-#define Seeds_bytes 32
-
-static void Seeds_random(unsigned char *s)
-{
-  randombytes(s,Seeds_bytes);
 }
 
+__attribute__((unused)) /* returns the smaller of x and y (signed compare) using a conditional select, not a branch */
+static inline
+crypto_int64 crypto_int64_min(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) {
+#if defined(__GNUC__) && defined(__x86_64__) /* cmovg: replace x with y when x > y */
+  __asm__ ("cmpq %1,%0\n cmovgq %1,%0" : "+r"(crypto_int64_x) : "r"(crypto_int64_y) : "cc");
+  return crypto_int64_x;
+#elif defined(__GNUC__) && defined(__aarch64__) /* csel: keep x when x < y, else take y */
+  __asm__ ("cmp %0,%1\n csel %0,%0,%1,lt" : "+r"(crypto_int64_x) : "r"(crypto_int64_y) : "cc");
+  return crypto_int64_x;
+#else /* portable C path */
+  crypto_int64 crypto_int64_r = crypto_int64_y ^ crypto_int64_x; /* r = bits in which x and y differ */
+  crypto_int64 crypto_int64_z = crypto_int64_y - crypto_int64_x;
+  crypto_int64_z ^= crypto_int64_r & (crypto_int64_z ^ crypto_int64_y); /* when the signs of x and y differ, take the sign of y instead of that of y - x */
+  crypto_int64_z = crypto_int64_negative_mask(crypto_int64_z); /* defined outside this chunk; presumably -1 when y < x */
+  crypto_int64_z &= crypto_int64_r;
+  return crypto_int64_x ^ crypto_int64_z; /* x ^ (x ^ y) = y when the mask is set, otherwise x */
 #endif
-
-/* ----- Generator, HashShort */
-
-#ifdef LPR
-
-/* G = Generator(k) */
-static void Generator(Fq *G,const unsigned char *k)
-{
-  uint32 L[p];
-  int i;
-
-  Expand(L,k);
-  for (i = 0;i < p;++i) G[i] = uint32_mod_uint14(L[i],q)-q12;
 }
 
-/* out = HashShort(r) */
-static void HashShort(small *out,const Inputs r)
-{
-  unsigned char s[Inputs_bytes];
-  unsigned char h[Hash_bytes];
-  uint32 L[p];
-
-  Inputs_encode(s,r);
-  Hash_prefix(h,5,s,sizeof s);
-  Expand(L,h);
-  Short_fromlist(out,L);
+__attribute__((unused)) /* returns the larger of x and y (signed compare) using a conditional select, not a branch */
+static inline
+crypto_int64 crypto_int64_max(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) {
+#if defined(__GNUC__) && defined(__x86_64__) /* cmovl: replace x with y when x < y */
+  __asm__ ("cmpq %1,%0\n cmovlq %1,%0" : "+r"(crypto_int64_x) : "r"(crypto_int64_y) : "cc");
+  return crypto_int64_x;
+#elif defined(__GNUC__) && defined(__aarch64__) /* csel: take y when x < y, else keep x */
+  __asm__ ("cmp %0,%1\n csel %0,%1,%0,lt" : "+r"(crypto_int64_x) : "r"(crypto_int64_y) : "cc");
+  return crypto_int64_x;
+#else /* portable C path */
+  crypto_int64 crypto_int64_r = crypto_int64_y ^ crypto_int64_x; /* r = bits in which x and y differ */
+  crypto_int64 crypto_int64_z = crypto_int64_y - crypto_int64_x;
+  crypto_int64_z ^= crypto_int64_r & (crypto_int64_z ^ crypto_int64_y); /* when the signs of x and y differ, take the sign of y instead of that of y - x */
+  crypto_int64_z = crypto_int64_negative_mask(crypto_int64_z); /* defined outside this chunk; presumably -1 when y < x */
+  crypto_int64_z &= crypto_int64_r;
+  return crypto_int64_y ^ crypto_int64_z; /* y ^ (x ^ y) = x when the mask is set, otherwise y */
+#endif
 }
 
+__attribute__((unused)) /* compare-exchange: on return *p holds the smaller and *q the larger of the two inputs */
+static inline
+void crypto_int64_minmax(crypto_int64 *crypto_int64_p,crypto_int64 *crypto_int64_q) {
+  crypto_int64 crypto_int64_x = *crypto_int64_p;
+  crypto_int64 crypto_int64_y = *crypto_int64_q;
+#if defined(__GNUC__) && defined(__x86_64__) /* z saves x; when x > y both cmovg fire and swap the pair */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmpq %2,%1\n movq %1,%0\n cmovgq %2,%1\n cmovgq %0,%2" : "=&r"(crypto_int64_z), "+&r"(crypto_int64_x), "+r"(crypto_int64_y) : : "cc");
+  *crypto_int64_p = crypto_int64_x;
+  *crypto_int64_q = crypto_int64_y;
+#elif defined(__GNUC__) && defined(__aarch64__) /* r = x if x < y else y; s = y if x < y else x */
+  crypto_int64 crypto_int64_r, crypto_int64_s;
+  __asm__ ("cmp %2,%3\n csel %0,%2,%3,lt\n csel %1,%3,%2,lt" : "=&r"(crypto_int64_r), "=r"(crypto_int64_s) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  *crypto_int64_p = crypto_int64_r;
+  *crypto_int64_q = crypto_int64_s;
+#else /* portable C path: XOR-swap under a mask */
+  crypto_int64 crypto_int64_r = crypto_int64_y ^ crypto_int64_x; /* r = bits in which x and y differ */
+  crypto_int64 crypto_int64_z = crypto_int64_y - crypto_int64_x;
+  crypto_int64_z ^= crypto_int64_r & (crypto_int64_z ^ crypto_int64_y); /* when the signs of x and y differ, take the sign of y instead of that of y - x */
+  crypto_int64_z = crypto_int64_negative_mask(crypto_int64_z); /* defined outside this chunk; presumably -1 when y < x */
+  crypto_int64_z &= crypto_int64_r;
+  crypto_int64_x ^= crypto_int64_z; /* both XORs together swap x and y when the mask is set */
+  crypto_int64_y ^= crypto_int64_z;
+  *crypto_int64_p = crypto_int64_x;
+  *crypto_int64_q = crypto_int64_y;
 #endif
-
-/* ----- NTRU LPRime Expand */
-
-#ifdef LPR
-
-/* (S,A),a = XKeyGen() */
-static void XKeyGen(unsigned char *S,Fq *A,small *a)
-{
-  Fq G[p];
-
-  Seeds_random(S);
-  Generator(G,S);
-  KeyGen(A,a,G);
 }
 
-/* B,T = XEncrypt(r,(S,A)) */
-static void XEncrypt(Fq *B,int8 *T,const int8 *r,const unsigned char *S,const Fq *A)
-{
-  Fq G[p];
-  small b[p];
+__attribute__((unused)) /* asm paths return -1 when x < y (signed) and 0 otherwise, without branching */
+static inline
+crypto_int64 crypto_int64_smaller_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) {
+#if defined(__GNUC__) && defined(__x86_64__) /* z = 0, q = -1, compare x with y, cmovl copies q into z */
+  crypto_int64 crypto_int64_q,crypto_int64_z;
+  __asm__ ("xorq %0,%0\n movq $-1,%1\n cmpq %3,%2\n cmovlq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#elif defined(__GNUC__) && defined(__aarch64__) /* compare x with y, csetm on lt */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmp %1,%2\n csetm %0,lt" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#else /* portable C path */
+  crypto_int64 crypto_int64_r = crypto_int64_x ^ crypto_int64_y; /* r = bits in which x and y differ */
+  crypto_int64 crypto_int64_z = crypto_int64_x - crypto_int64_y;
+  crypto_int64_z ^= crypto_int64_r & (crypto_int64_z ^ crypto_int64_x); /* when the signs of x and y differ, take the sign of x instead of that of x - y */
+  return crypto_int64_negative_mask(crypto_int64_z); /* defined outside this chunk; presumably -1 for negative input */
+#endif
+}
 
-  Generator(G,S);
-  HashShort(b,r);
-  Encrypt(B,T,r,G,A,b);
+__attribute__((unused)) /* asm paths return 1 when x < y (signed) and 0 otherwise, without branching */
+static inline
+crypto_int64 crypto_int64_smaller_01(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) {
+#if defined(__GNUC__) && defined(__x86_64__) /* z = 0, q = 1, compare x with y, cmovl copies q into z */
+  crypto_int64 crypto_int64_q,crypto_int64_z;
+  __asm__ ("xorq %0,%0\n movq $1,%1\n cmpq %3,%2\n cmovlq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#elif defined(__GNUC__) && defined(__aarch64__) /* compare x with y, cset on lt */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmp %1,%2\n cset %0,lt" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#else /* portable C path */
+  crypto_int64 crypto_int64_r = crypto_int64_x ^ crypto_int64_y; /* r = bits in which x and y differ */
+  crypto_int64 crypto_int64_z = crypto_int64_x - crypto_int64_y;
+  crypto_int64_z ^= crypto_int64_r & (crypto_int64_z ^ crypto_int64_x); /* when the signs of x and y differ, take the sign of x instead of that of x - y */
+  return crypto_int64_unsigned_topbit_01(crypto_int64_z); /* defined outside this chunk; presumably the top bit as 0/1 */
+#endif
 }
 
-#define XDecrypt Decrypt
+__attribute__((unused)) /* returns -1 when x <= y (signed) and 0 otherwise, without branching */
+static inline
+crypto_int64 crypto_int64_leq_mask(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) {
+#if defined(__GNUC__) && defined(__x86_64__) /* z = 0, q = -1, compare x with y, cmovle copies q into z */
+  crypto_int64 crypto_int64_q,crypto_int64_z;
+  __asm__ ("xorq %0,%0\n movq $-1,%1\n cmpq %3,%2\n cmovleq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#elif defined(__GNUC__) && defined(__aarch64__) /* compare x with y, csetm on le */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmp %1,%2\n csetm %0,le" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#else /* portable C path: x <= y is the negation of y < x, hence the swapped arguments */
+  return ~crypto_int64_smaller_mask(crypto_int64_y,crypto_int64_x);
+#endif
+}
 
+__attribute__((unused)) /* returns 1 when x <= y (signed) and 0 otherwise, without branching */
+static inline
+crypto_int64 crypto_int64_leq_01(crypto_int64 crypto_int64_x,crypto_int64 crypto_int64_y) {
+#if defined(__GNUC__) && defined(__x86_64__) /* z = 0, q = 1, compare x with y, cmovle copies q into z */
+  crypto_int64 crypto_int64_q,crypto_int64_z;
+  __asm__ ("xorq %0,%0\n movq $1,%1\n cmpq %3,%2\n cmovleq %1,%0" : "=&r"(crypto_int64_z), "=&r"(crypto_int64_q) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#elif defined(__GNUC__) && defined(__aarch64__) /* compare x with y, cset on le */
+  crypto_int64 crypto_int64_z;
+  __asm__ ("cmp %1,%2\n cset %0,le" : "=r"(crypto_int64_z) : "r"(crypto_int64_x), "r"(crypto_int64_y) : "cc");
+  return crypto_int64_z;
+#else /* portable C path: x <= y is the negation of y < x, hence the swapped arguments */
+  return 1-crypto_int64_smaller_01(crypto_int64_y,crypto_int64_x);
 #endif
+}
 
-/* ----- encoding small polynomials (including short polynomials) */
+__attribute__((unused)) /* returns the number of 1 bits in x, computed with masks, shifts and adds only */
+static inline
+int crypto_int64_ones_num(crypto_int64 crypto_int64_x) {
+  crypto_int64_unsigned crypto_int64_y = crypto_int64_x; /* work on the unsigned copy so the shifts below are logical */
+  const crypto_int64 C0 = 0x5555555555555555; /* every other bit */
+  const crypto_int64 C1 = 0x3333333333333333; /* low 2 bits of each nibble */
+  const crypto_int64 C2 = 0x0f0f0f0f0f0f0f0f; /* low nibble of each byte */
+  crypto_int64_y -= ((crypto_int64_y >> 1) & C0); /* each 2-bit field now holds its own bit count */
+  crypto_int64_y = (crypto_int64_y & C1) + ((crypto_int64_y >> 2) & C1); /* sum neighbouring fields into 4-bit counts */
+  crypto_int64_y = (crypto_int64_y + (crypto_int64_y >> 4)) & C2; /* per-byte counts */
+  crypto_int64_y += crypto_int64_y >> 8; /* fold the byte counts together ... */
+  crypto_int64_y += crypto_int64_y >> 16;
+  crypto_int64_y = (crypto_int64_y + (crypto_int64_y >> 32)) & 0xff; /* ... and keep the total from the low byte */
+  return crypto_int64_y;
+}
+
+__attribute__((unused)) /* index of the lowest set bit of x; the x86-64 path returns 64 when x == 0 */
+static inline
+int crypto_int64_bottomzeros_num(crypto_int64 crypto_int64_x) {
+#if defined(__GNUC__) && defined(__x86_64__) /* bsf finds the lowest set bit; cmove loads the fallback when the input was 0 */
+  crypto_int64 fallback = 64;
+  __asm__ ("bsfq %0,%0\n cmoveq %1,%0" : "+&r"(crypto_int64_x) : "r"(fallback) : "cc");
+  return crypto_int64_x;
+#elif defined(__GNUC__) && defined(__aarch64__) /* rbit reverses the bit order, so clz counts the former low-end zeros */
+  int64_t crypto_int64_y;
+  __asm__ ("rbit %0,%1\n clz %0,%0" : "=r"(crypto_int64_y) : "r"(crypto_int64_x) : );
+  return crypto_int64_y;
+#else /* portable C path: build a mask of the low-end zero bits, then count its ones */
+  crypto_int64 crypto_int64_y = crypto_int64_x ^ (crypto_int64_x-1); /* ones from bit 0 up to and including the lowest set bit */
+  crypto_int64_y = ((crypto_int64) crypto_int64_y) >> 1; /* drop one position so the lowest set bit itself is not counted */
+  crypto_int64_y &= ~(crypto_int64_x & (((crypto_int64) 1) << (64-1))); /* clears bit 63 of y when bit 63 of x is set */
+  return crypto_int64_ones_num(crypto_int64_y);
+#endif
+}
 
-#define Small_bytes ((p+3)/4)
+#endif
 
-/* these are the only functions that rely on p mod 4 = 1 */
+/* from supercop-20240808/crypto_sort/int32/portable4/sort.c */
+#define int32_MINMAX(a,b) crypto_int32_minmax(&a,&b) /* a,b must be lvalues: their addresses are passed; presumably leaves min in a, max in b */
 
-static void Small_encode(unsigned char *s,const small *f)
+static void crypto_sort_int32(void *array,long long n) /* in-place sort of n int32 values; elements are only ever touched through int32_MINMAX */
 {
-  small x;
-  int i;
+  long long top,p,q,r,i,j; /* every loop bound and branch below depends on these indices and n, never on element values */
+  int32 *x = array;
 
-  for (i = 0;i < p/4;++i) {
-    x = *f++ + 1;
-    x += (*f++ + 1)<<2;
-    x += (*f++ + 1)<<4;
-    x += (*f++ + 1)<<6;
-    *s++ = x;
-  }
-  x = *f++ + 1;
-  *s++ = x;
-}
+  if (n < 2) return; /* nothing to order */
+  top = 1;
+  while (top < n - top) top += top; /* smallest power of two with 2*top >= n */
 
-static void Small_decode(small *f,const unsigned char *s)
-{
-  unsigned char x;
-  int i;
+  for (p = top;p >= 1;p >>= 1) { /* compare distance p halves on every pass */
+    i = 0;
+    while (i + 2 * p <= n) { /* full groups of 2*p elements: pair x[j] with x[j+p] */
+      for (j = i;j < i + p;++j)
+        int32_MINMAX(x[j],x[j+p]);
+      i += 2 * p;
+    }
+    for (j = i;j < n - p;++j) /* trailing partial group */
+      int32_MINMAX(x[j],x[j+p]);
 
-  for (i = 0;i < p/4;++i) {
-    x = *s++;
-    *f++ = ((small)(x&3))-1; x >>= 2;
-    *f++ = ((small)(x&3))-1; x >>= 2;
-    *f++ = ((small)(x&3))-1; x >>= 2;
-    *f++ = ((small)(x&3))-1;
+    i = 0;
+    j = 0;
+    for (q = top;q > p;q >>= 1) { /* second phase: x[j+p] against x[j+r] for r = q, q/2, ... down to 2*p */
+      if (j != i) for (;;) { /* resume a group that the previous, larger q left unfinished */
+        if (j == n - q) goto done; /* ran out of elements for this q */
+        int32 a = x[j + p];
+        for (r = q;r > p;r >>= 1)
+          int32_MINMAX(a,x[j + r]);
+        x[j + p] = a; /* a holds the running minimum of the chain */
+        ++j;
+        if (j == i + p) {
+          i += 2 * p;
+          break;
+        }
+      }
+      while (i + p <= n - q) { /* whole groups that fit for this q */
+        for (j = i;j < i + p;++j) {
+          int32 a = x[j + p];
+          for (r = q;r > p;r >>= 1)
+            int32_MINMAX(a,x[j+r]);
+          x[j + p] = a;
+        }
+        i += 2 * p;
+      }
+      /* now i + p > n - q */
+      j = i;
+      while (j < n - q) { /* leftover elements of the last group */
+        int32 a = x[j + p];
+        for (r = q;r > p;r >>= 1)
+          int32_MINMAX(a,x[j+r]);
+        x[j + p] = a;
+        ++j;
+      }
+
+      done: ;
+    }
   }
-  x = *s++;
-  *f++ = ((small)(x&3))-1;
 }
 
-/* ----- encoding general polynomials */
+/* from supercop-20240808/crypto_sort/uint32/useint32/sort.c */
 
-#ifndef LPR
+/* can save time by vectorizing xor loops */
+/* can save time by integrating xor loops with int32_sort */
 
-static void Rq_encode(unsigned char *s,const Fq *r)
+static void crypto_sort_uint32(void *array,long long n) /* sorts n uint32 values in place by reusing the signed sort */
 {
-  uint16 R[p],M[p];
-  int i;
-
-  for (i = 0;i < p;++i) R[i] = r[i]+q12;
-  for (i = 0;i < p;++i) M[i] = q;
-  Encode(s,R,M,p);
+  crypto_uint32 *x = array;
+  long long j;
+  for (j = 0;j < n;++j) x[j] ^= 0x80000000; /* flip the top bit so signed order matches unsigned order */
+  crypto_sort_int32(array,n);
+  for (j = 0;j < n;++j) x[j] ^= 0x80000000; /* flip it back to restore the original values */
 }
 
-static void Rq_decode(Fq *r,const unsigned char *s)
-{
-  uint16 R[p],M[p];
-  int i;
+/* from supercop-20240808/crypto_kem/sntrup761/compact/kem.c */
+// 20240806 djb: some automated conversion to cryptoint
 
-  for (i = 0;i < p;++i) M[i] = q;
-  Decode(R,s,M,p);
-  for (i = 0;i < p;++i) r[i] = ((Fq)R[i])-q12;
-}
+#define p 761 /* length of the coefficient arrays handled by the polynomial routines below */
+#define q 4591 /* modulus that Fq_freeze reduces by */
+#define w 286 /* weight that Weightw_mask compares against */
+#define q12 ((q - 1) / 2) /* = 2295 */
+typedef int8_t small; /* element type of the arrays processed by the R3_* routines */
+typedef int16_t Fq; /* element type of the arrays processed by the Rq_* routines */
+#define Hash_bytes 32 /* not referenced inside this chunk */
+#define Small_bytes ((p + 3) / 4) /* = 191 */
+typedef small Inputs[p]; /* array type, so it decays to a pointer when passed to a function */
+#define SecretKeys_bytes (2 * Small_bytes) /* = 382 */
+#define Confirm_bytes 32 /* not referenced inside this chunk */
 
-#endif
+static small F3_freeze(int16_t x) { return x - 3 * ((10923 * x + 16384) >> 15); } /* x minus 3 times a rounded x/3; 3*10923 = 32769 ~ 2^15. NOTE(review): presumably yields -1, 0 or 1 over the input range used - confirm */
+
+static Fq Fq_freeze(int32_t x) { /* reduces x modulo q with three multiply-and-shift steps; no division or branch on x */
+  const int32_t q16 = (0x10000 + q / 2) / q; /* 2^16 / q, rounded to nearest */
+  const int32_t q20 = (0x100000 + q / 2) / q; /* 2^20 / q, rounded to nearest */
+  const int32_t q28 = (0x10000000 + q / 2) / q; /* 2^28 / q, rounded to nearest */
+  x -= q * ((q16 * x) >> 16); /* coarse estimate of x/q, subtracted as a multiple of q */
+  x -= q * ((q20 * x) >> 20); /* finer estimate on the already smaller x */
+  return x - q * ((q28 * x + 0x8000000) >> 28); /* last step adds 2^27 before the shift, i.e. rounds the quotient; presumably lands in [-q12, q12] */
+}
+
+static int Weightw_mask(small *r) { /* checks whether exactly w of the p entries of r have their low bit set */
+  int i, weight = 0;
+  for (i = 0; i < p; ++i) weight += crypto_int64_bottombit_01(r[i]); /* adds r[i] & 1 for every entry, with no branch on the data */
+  return crypto_int16_nonzero_mask(weight - w); /* helper defined outside this chunk; presumably 0 when weight == w, otherwise -1 */
+}
+
+static void uint32_divmod_uint14(uint32_t *Q, uint16_t *r, uint32_t x, uint16_t m) { /* writes quotient x/m to *Q and remainder to *r without a division by x; NOTE(review): name suggests m < 2^14 - confirm */
+  uint32_t qpart, mask, v = 0x80000000 / m; /* v = floor(2^31 / m); m is the only divisor used */
+  qpart = (x * (uint64_t)v) >> 31; /* first quotient estimate via a 64-bit product */
+  x -= qpart * m;
+  *Q = qpart;
+  qpart = (x * (uint64_t)v) >> 31; /* second estimate on what is left of x */
+  x -= qpart * m;
+  *Q += qpart;
+  x -= m; /* tentatively take one more m ... */
+  *Q += 1;
+  mask = crypto_int32_negative_mask(x); /* helper defined outside this chunk; presumably all-ones when that subtraction went below 0 */
+  x += mask & (uint32_t)m; /* ... and undo it under the mask */
+  *Q += mask; /* adding all-ones subtracts 1 from the quotient */
+  *r = x;
+}
 
-/* ----- encoding rounded polynomials */
+static uint16_t uint32_mod_uint14(uint32_t x, uint16_t m) { /* remainder-only wrapper around uint32_divmod_uint14 */
+  uint32_t Q; /* quotient is computed but discarded */
+  uint16_t r;
+  uint32_divmod_uint14(&Q, &r, x, m);
+  return r;
+}
 
-static void Rounded_encode(unsigned char *s,const Fq *r)
-{
-  uint16 R[p],M[p];
-  int i;
+static void Encode(unsigned char *out, const uint16_t *R, const uint16_t *M, long long len) { /* packs len values R[i] with per-value bounds M[i] into bytes at out, recursing on a half-length list */
+  if (len == 1) { /* base case: emit the low bytes of the single remaining value */
+    uint16_t r = R[0], m = M[0];
+    while (m > 1) {
+      *out++ = r; /* stores the low 8 bits of r */
+      r >>= 8;
+      m = (m + 255) >> 8; /* ceil(m / 256) */
+    }
+  }
+  if (len > 1) {
+    uint16_t R2[(len + 1) / 2], M2[(len + 1) / 2]; /* variable-length arrays for the next level */
+    long long i;
+    for (i = 0; i < len - 1; i += 2) { /* merge neighbours into one value r with bound m */
+      uint32_t m0 = M[i];
+      uint32_t r = R[i] + R[i + 1] * m0;
+      uint32_t m = M[i + 1] * m0;
+      while (m >= 16384) { /* emit low bytes until the merged bound is below 2^14 */
+        *out++ = r;
+        r >>= 8;
+        m = (m + 255) >> 8;
+      }
+      R2[i / 2] = r;
+      M2[i / 2] = m;
+    }
+    if (i < len) { /* odd len: the last value is carried over unchanged */
+      R2[i / 2] = R[i];
+      M2[i / 2] = M[i];
+    }
+    Encode(out, R2, M2, (len + 1) / 2);
+  }
+}
 
-  for (i = 0;i < p;++i) R[i] = ((r[i]+q12)*10923)>>15;
-  for (i = 0;i < p;++i) M[i] = (q+2)/3;
-  Encode(s,R,M,p);
+static void Decode(uint16_t *out, const unsigned char *S, const uint16_t *M, long long len) { /* reads bytes from S and writes len values to out, using the same pairing and byte-count thresholds as Encode */
+  if (len == 1) { /* base case: zero, one or two bytes depending on M[0] */
+    if (M[0] == 1)
+      *out = 0;
+    else if (M[0] <= 256)
+      *out = uint32_mod_uint14(S[0], M[0]);
+    else
+      *out = uint32_mod_uint14(S[0] + (((uint16_t)S[1]) << 8), M[0]); /* two bytes, low byte first */
+  }
+  if (len > 1) {
+    uint16_t R2[(len + 1) / 2], M2[(len + 1) / 2], bottomr[len / 2]; /* variable-length arrays for the next level */
+    uint32_t bottomt[len / 2];
+    long long i;
+    for (i = 0; i < len - 1; i += 2) { /* for each pair: pull the low bytes that belong to it and derive the merged bound */
+      uint32_t m = M[i] * (uint32_t)M[i + 1];
+      if (m > 256 * 16383) { /* this pair has two low bytes in the stream */
+        bottomt[i / 2] = 256 * 256;
+        bottomr[i / 2] = S[0] + 256 * S[1];
+        S += 2;
+        M2[i / 2] = (((m + 255) >> 8) + 255) >> 8;
+      } else if (m >= 16384) { /* this pair has one low byte in the stream */
+        bottomt[i / 2] = 256;
+        bottomr[i / 2] = S[0];
+        S += 1;
+        M2[i / 2] = (m + 255) >> 8;
+      } else { /* this pair has no low bytes in the stream */
+        bottomt[i / 2] = 1;
+        bottomr[i / 2] = 0;
+        M2[i / 2] = m;
+      }
+    }
+    if (i < len) M2[i / 2] = M[i]; /* odd len: last bound carried over unchanged */
+    Decode(R2, S, M2, (len + 1) / 2);
+    for (i = 0; i < len - 1; i += 2) { /* rebuild each merged value, then split it into its two members */
+      uint32_t r1, r = bottomr[i / 2];
+      uint16_t r0;
+      r += bottomt[i / 2] * R2[i / 2]; /* low bytes plus the decoded upper part */
+      uint32_divmod_uint14(&r1, &r0, r, M[i]); /* r0 = r mod M[i], r1 = r / M[i] */
+      r1 = uint32_mod_uint14(r1, M[i + 1]);
+      *out++ = r0;
+      *out++ = r1;
+    }
+    if (i < len) *out++ = R2[i / 2]; /* odd len: last value carried over unchanged */
+  }
 }
 
-static void Rounded_decode(Fq *r,const unsigned char *s)
-{
-  uint16 R[p],M[p];
+static void R3_fromRq(small *out, const Fq *r) { /* applies F3_freeze to each of the p entries of r and stores the results in out */
   int i;
-
-  for (i = 0;i < p;++i) M[i] = (q+2)/3;
-  Decode(R,s,M,p);
-  for (i = 0;i < p;++i) r[i] = R[i]*3-q12;
+  for (i = 0; i < p; ++i) out[i] = F3_freeze(r[i]); /* element-wise; no interaction between entries */
 }
 
-/* ----- encoding top polynomials */
+static void R3_mult(small *h, const small *f, const small *g) { /* h = f*g as length-p polynomials, folded modulo x^p - x - 1 and reduced with F3_freeze */
+  int16_t fg[p + p - 1]; /* full product has 2p-1 coefficients */
+  int i, j;
+  for (i = 0; i < p + p - 1; ++i) fg[i] = 0;
+  for (i = 0; i < p; ++i)
+    for (j = 0; j < p; ++j) fg[i + j] += f[i] * (int16_t)g[j]; /* schoolbook product, every (i,j) pair visited */
+  for (i = p; i < p + p - 1; ++i) fg[i - p] += fg[i]; /* x^i contributes to x^(i-p) ... */
+  for (i = p; i < p + p - 1; ++i) fg[i - p + 1] += fg[i]; /* ... and to x^(i-p+1), i.e. x^p is replaced by x + 1 */
+  for (i = 0; i < p; ++i) h[i] = F3_freeze(fg[i]);
+}
 
-#ifdef LPR
+static int R3_recip(small *out, const small *in) { /* writes a candidate reciprocal of in (coefficients reduced with F3_freeze) to out; return value is a mask derived from the final delta */
+  small f[p + 1], g[p + 1], v[p + 1], r[p + 1];
+  int sign, swap, t, i, loop, delta = 1;
+  for (i = 0; i < p + 1; ++i) v[i] = 0;
+  for (i = 0; i < p + 1; ++i) r[i] = 0;
+  r[0] = 1;
+  for (i = 0; i < p; ++i) f[i] = 0;
+  f[0] = 1;
+  f[p - 1] = f[p] = -1; /* f = x^p - x - 1 with its coefficients stored in reversed order */
+  for (i = 0; i < p; ++i) g[p - 1 - i] = in[i]; /* g = input, also reversed */
+  g[p] = 0;
+  for (loop = 0; loop < 2 * p - 1; ++loop) { /* fixed iteration count, independent of the input values */
+    for (i = p; i > 0; --i) v[i] = v[i - 1]; /* shift v up by one position */
+    v[0] = 0;
+    sign = -g[0] * f[0];
+    swap = crypto_int16_negative_mask(-delta) & crypto_int16_nonzero_mask(g[0]); /* helpers defined outside this chunk; presumably all-ones when delta > 0 and g[0] != 0 */
+    delta ^= swap & (delta ^ -delta); /* delta becomes -delta under the mask */
+    delta += 1;
+    for (i = 0; i < p + 1; ++i) { /* masked XOR-swap of (f,g) and (v,r), no branch on swap */
+      t = swap & (f[i] ^ g[i]);
+      f[i] ^= t;
+      g[i] ^= t;
+      t = swap & (v[i] ^ r[i]);
+      v[i] ^= t;
+      r[i] ^= t;
+    }
+    for (i = 0; i < p + 1; ++i) g[i] = F3_freeze(g[i] + sign * f[i]); /* with sign = -g[0]*f[0] from before the swap */
+    for (i = 0; i < p + 1; ++i) r[i] = F3_freeze(r[i] + sign * v[i]);
+    for (i = 0; i < p; ++i) g[i] = g[i + 1]; /* shift g down by one position */
+    g[p] = 0;
+  }
+  sign = f[0];
+  for (i = 0; i < p; ++i) out[i] = sign * v[p - 1 - i]; /* undo the reversal and scale by f[0] */
+  return crypto_int16_nonzero_mask(delta); /* presumably 0 when delta == 0, else -1; NOTE(review): confirm callers treat 0 as success */
+}
 
-#define Top_bytes (I/2)
+static void Rq_mult_small(Fq *h, const Fq *f, const small *g) { /* h = f*g as length-p polynomials, folded modulo x^p - x - 1 and reduced with Fq_freeze */
+  int32_t fg[p + p - 1]; /* full product has 2p-1 coefficients, accumulated in 32 bits */
+  int i, j;
+  for (i = 0; i < p + p - 1; ++i) fg[i] = 0;
+  for (i = 0; i < p; ++i)
+    for (j = 0; j < p; ++j) fg[i + j] += f[i] * (int32_t)g[j]; /* schoolbook product, every (i,j) pair visited */
+  for (i = p; i < p + p - 1; ++i) fg[i - p] += fg[i]; /* x^i contributes to x^(i-p) ... */
+  for (i = p; i < p + p - 1; ++i) fg[i - p + 1] += fg[i]; /* ... and to x^(i-p+1), i.e. x^p is replaced by x + 1 */
+  for (i = 0; i < p; ++i) h[i] = Fq_freeze(fg[i]);
+}
 
-static void Top_encode(unsigned char *s,const int8 *T)
-{
+static void Rq_mult3(Fq *h, const Fq *f) { /* h[i] = Fq_freeze(3 * f[i]) for each of the p entries */
   int i;
-  for (i = 0;i < Top_bytes;++i)
-    s[i] = T[2*i]+(T[2*i+1]<<4);
+  for (i = 0; i < p; ++i) h[i] = Fq_freeze(3 * f[i]); /* element-wise scaling followed by reduction */
 }
 
-static void Top_decode(int8 *T,const unsigned char *s)
-{
-  int i;
-  for (i = 0;i < Top_bytes;++i) {
-    T[2*i] = s[i]&15;
-    T[2*i+1] = s[i]>>4;
+static Fq Fq_recip(Fq a1) { /* returns a1 raised to the power q-2 (reduced with Fq_freeze), built by repeated multiplication */
+  int i = 1; /* invariant: ai = a1^i */
+  Fq ai = a1;
+  while (i < q - 2) { /* q-3 multiplications; the trip count depends only on q */
+    ai = Fq_freeze(a1 * (int32_t)ai); /* widen before multiplying so the product fits */
+    i += 1;
   }
+  return ai; /* a1^(q-2) */
 }
 
-#endif
-
-/* ----- Streamlined NTRU Prime Core plus encoding */
-
-#ifndef LPR
-
-typedef small Inputs[p]; /* passed by reference */
-#define Inputs_random Short_random
-#define Inputs_encode Small_encode
-#define Inputs_bytes Small_bytes
-
-#define Ciphertexts_bytes Rounded_bytes
-#define SecretKeys_bytes (2*Small_bytes)
-#define PublicKeys_bytes Rq_bytes
-
-/* pk,sk = ZKeyGen() */
-static void ZKeyGen(unsigned char *pk,unsigned char *sk)
-{
-  Fq h[p];
-  small f[p],v[p];
-
-  KeyGen(h,f,v);
-  Rq_encode(pk,h);
-  Small_encode(sk,f); sk += Small_bytes;
-  Small_encode(sk,v);
+static int Rq_recip3(Fq *out, const small *in) { /* writes p coefficients to out and returns crypto_int16_nonzero_mask(delta); NOTE(review): by name out = 1/(3*in) and 0 means success -- confirm */
+  Fq f[p + 1], g[p + 1], v[p + 1], r[p + 1], scale;
+  int swap, t, i, loop, delta = 1;
+  int32_t f0, g0;
+  for (i = 0; i < p + 1; ++i) v[i] = 0;
+  for (i = 0; i < p + 1; ++i) r[i] = 0;
+  r[0] = Fq_recip(3); /* r starts as the constant Fq_recip(3) */
+  for (i = 0; i < p; ++i) f[i] = 0;
+  f[0] = 1;
+  f[p - 1] = f[p] = -1; /* matches x^p - x - 1 if index 0 is taken as the leading coefficient */
+  for (i = 0; i < p; ++i) g[p - 1 - i] = in[i]; /* g = in with the coefficient order reversed */
+  g[p] = 0;
+  for (loop = 0; loop < 2 * p - 1; ++loop) { /* fixed 2p-1 iterations, independent of the input values */
+    for (i = p; i > 0; --i) v[i] = v[i - 1]; /* shift v up by one index */
+    v[0] = 0;
+    swap = crypto_int16_negative_mask(-delta) & crypto_int16_nonzero_mask(g[0]); /* helpers defined elsewhere; presumably -1 when delta > 0 and g[0] != 0, else 0 */
+    delta ^= swap & (delta ^ -delta); /* delta becomes -delta when swap has all bits set, otherwise unchanged */
+    delta += 1;
+    for (i = 0; i < p + 1; ++i) { /* masked XOR exchange of (f,g) and (v,r); no branch on swap */
+      t = swap & (f[i] ^ g[i]);
+      f[i] ^= t;
+      g[i] ^= t;
+      t = swap & (v[i] ^ r[i]);
+      v[i] ^= t;
+      r[i] ^= t;
+    }
+    f0 = f[0];
+    g0 = g[0];
+    for (i = 0; i < p + 1; ++i) g[i] = Fq_freeze(f0 * g[i] - g0 * f[i]); /* index 0 becomes Fq_freeze(f0*g0 - g0*f0), i.e. Fq_freeze(0) */
+    for (i = 0; i < p + 1; ++i) r[i] = Fq_freeze(f0 * r[i] - g0 * v[i]); /* same linear combination applied to (r, v) */
+    for (i = 0; i < p; ++i) g[i] = g[i + 1]; /* shift g down by one index, discarding g[0] */
+    g[p] = 0;
+  }
+  scale = Fq_recip(f[0]);
+  for (i = 0; i < p; ++i) out[i] = Fq_freeze(scale * (int32_t)v[p - 1 - i]); /* undo the reversal and scale by Fq_recip(f[0]) */
+  return crypto_int16_nonzero_mask(delta); /* derived from the final delta only; KeyGen in this file ignores it */
 }
 
-/* C = ZEncrypt(r,pk) */
-static void ZEncrypt(unsigned char *C,const Inputs r,const unsigned char *pk)
-{
-  Fq h[p];
-  Fq c[p];
-  Rq_decode(h,pk);
-  Encrypt(c,r,h);
-  Rounded_encode(C,c);
+static void Round(Fq *out, const Fq *a) { /* out[i] = a[i] minus F3_freeze(a[i]) for each of the p coefficients */
+  int i;
+  for (i = 0; i < p; ++i) out[i] = a[i] - F3_freeze(a[i]); /* F3_freeze is defined elsewhere; presumably the residue mod 3, making out[i] a multiple of 3 -- confirm */
 }
 
-/* r = ZDecrypt(C,sk) */
-static void ZDecrypt(Inputs r,const unsigned char *C,const unsigned char *sk)
-{
-  small f[p],v[p];
-  Fq c[p];
-
-  Small_decode(f,sk); sk += Small_bytes;
-  Small_decode(v,sk);
-  Rounded_decode(c,C);
-  Decrypt(r,c,f,v);
+static void Short_fromlist(small *out, const uint32_t *in) { /* maps p 32-bit words to p values in {-1,0,1}, exactly w of them nonzero */
+  uint32_t L[p];
+  int i;
+  for (i = 0; i < w; ++i) L[i] = in[i] & (uint32_t)-2; /* first w words: clear bit 0, so the low two bits are 00 or 10 */
+  for (i = w; i < p; ++i) L[i] = (in[i] & (uint32_t)-3) | 1; /* remaining words: clear bit 1 and set bit 0, so the low two bits are 01 */
+  crypto_sort_uint32(L, p); /* sort on the whole word; the upper 30 bits decide where each tagged entry lands */
+  for (i = 0; i < p; ++i) out[i] = (L[i] & 3) - 1; /* low two bits 00/01/10 become -1/0/+1 */
 }
 
-#endif
-
-/* ----- NTRU LPRime Expand plus encoding */
-
-#ifdef LPR
-
-#define Ciphertexts_bytes (Rounded_bytes+Top_bytes)
-#define SecretKeys_bytes Small_bytes
-#define PublicKeys_bytes (Seeds_bytes+Rounded_bytes)
-
-static void Inputs_random(Inputs r)
-{
-  unsigned char s[Inputs_bytes];
+static void Hash_prefix(unsigned char *out, int b, const unsigned char *in, int inlen) { /* out[0..31] = first 32 bytes of crypto_hash_sha512 over the byte b followed by in[0..inlen-1] */
+  unsigned char x[inlen + 1], h[64]; /* x is a variable-length array sized by the caller-supplied inlen */
   int i;
-
-  randombytes(s,sizeof s);
-  for (i = 0;i < I;++i) r[i] = 1&(s[i>>3]>>(i&7));
+  x[0] = b; /* one-byte prefix */
+  for (i = 0; i < inlen; ++i) x[i + 1] = in[i];
+  crypto_hash_sha512(h, x, inlen + 1);
+  for (i = 0; i < 32; ++i) out[i] = h[i]; /* keep only the first 32 of the 64 bytes in h */
 }
 
-/* pk,sk = ZKeyGen() */
-static void ZKeyGen(unsigned char *pk,unsigned char *sk)
-{
-  Fq A[p];
-  small a[p];
-
-  XKeyGen(pk,A,a); pk += Seeds_bytes;
-  Rounded_encode(pk,A);
-  Small_encode(sk,a);
+static uint32_t urandom32(void) { /* returns a 32-bit word built from 4 bytes obtained from randombytes() */
+  unsigned char c[4];
+  uint32_t result = 0;
+  int i;
+  randombytes(c, 4);
+  for (i = 0; i < 4; ++i) result += ((uint32_t)c[i]) << (8 * i); /* little-endian assembly: c[0] is the low byte */
+  return result;
 }
 
-/* c = ZEncrypt(r,pk) */
-static void ZEncrypt(unsigned char *c,const Inputs r,const unsigned char *pk)
-{
-  Fq A[p];
-  Fq B[p];
-  int8 T[I];
-
-  Rounded_decode(A,pk+Seeds_bytes);
-  XEncrypt(B,T,r,pk,A);
-  Rounded_encode(c,B); c += Rounded_bytes;
-  Top_encode(c,T);
+static void Short_random(small *out) { /* fills out[0..p-1] by passing p fresh urandom32() words to Short_fromlist */
+  uint32_t L[p];
+  int i;
+  for (i = 0; i < p; ++i) L[i] = urandom32();
+  Short_fromlist(out, L);
 }
 
-/* r = ZDecrypt(C,sk) */
-static void ZDecrypt(Inputs r,const unsigned char *c,const unsigned char *sk)
-{
-  small a[p];
-  Fq B[p];
-  int8 T[I];
-
-  Small_decode(a,sk);
-  Rounded_decode(B,c);
-  Top_decode(T,c+Rounded_bytes);
-  XDecrypt(r,B,T,a);
+static void Small_random(small *out) { /* fills out[0..p-1] with values in {-1,0,1}, one urandom32() call per element */
+  int i;
+  for (i = 0; i < p; ++i) out[i] = (((urandom32() & 0x3fffffff) * 3) >> 30) - 1; /* 30-bit value times 3, shifted right by 30, gives 0..2; subtracting 1 gives -1..1 */
 }
 
-#endif
-
-/* ----- confirmation hash */
+static void KeyGen(Fq *h, small *f, small *ginv) { /* outputs h = Rq_recip3(f) * g, the random short f, and ginv from R3_recip(g) */
+  small g[p];
+  Fq finv[p];
+  for (;;) { /* resample g until R3_recip reports 0 */
+    int result;
+    Small_random(g);
+    result = R3_recip(ginv, g);
+    crypto_declassify(&result, sizeof result); /* expands to an empty statement in this file (see the #define near the top) */
+    if (result == 0) break;
+  }
+  Short_random(f);
+  Rq_recip3(finv, f); /* return value is not checked here */
+  Rq_mult_small(h, finv, g);
+}
 
-#define Confirm_bytes 32
+static void Encrypt(Fq *c, const small *r, const Fq *h) { /* c = Round(h * r) */
+  Fq hr[p];
+  Rq_mult_small(hr, h, r);
+  Round(c, hr);
+}
+
+static void Decrypt(small *r, const Fq *c, const small *f, const small *ginv) { /* r = R3_mult(R3_fromRq(3*c*f), ginv), or a fixed fallback vector when Weightw_mask is set */
+  Fq cf[p], cf3[p];
+  small e[p], ev[p];
+  int mask, i;
+  Rq_mult_small(cf, c, f);
+  Rq_mult3(cf3, cf);
+  R3_fromRq(e, cf3); /* defined elsewhere; by name converts Rq coefficients to R3 */
+  R3_mult(ev, e, ginv);
+  mask = Weightw_mask(ev); /* defined elsewhere; presumably 0 when ev has weight w and -1 otherwise -- confirm */
+  for (i = 0; i < w; ++i) r[i] = ((ev[i] ^ 1) & ~mask) ^ 1; /* mask == 0 keeps ev[i]; mask == -1 forces 1 */
+  for (i = w; i < p; ++i) r[i] = ev[i] & ~mask; /* mask == 0 keeps ev[i]; mask == -1 forces 0 */
+}
+
+static void Small_encode(unsigned char *s, const small *f) { /* packs f into p/4 bytes of four 2-bit fields plus one final byte */
+  int i, j;
+  for (i = 0; i < p / 4; ++i) {
+    small x = 0;
+    for (j = 0;j < 4;++j) x += (*f++ + 1) << (2 * j); /* each element is stored as value+1 in bits 2j..2j+1, lowest field first */
+    *s++ = x;
+  }
+  *s = *f++ + 1; /* exactly one trailing element is written, so this assumes p % 4 == 1 */
+}
 
-/* h = HashConfirm(r,pk,cache); cache is Hash4(pk) */
-static void HashConfirm(unsigned char *h,const unsigned char *r,const unsigned char *pk,const unsigned char *cache)
-{
-#ifndef LPR
-  unsigned char x[Hash_bytes*2];
-  int i;
+static void Small_decode(small *f, const unsigned char *s) { /* unpacks the layout written by Small_encode: 2-bit fields, each minus 1 */
+  int i, j;
+  for (i = 0; i < p / 4; ++i) {
+    unsigned char x = *s++;
+    for (j = 0;j < 4;++j) *f++ = ((small)((x >> (2 * j)) & 3)) - 1; /* field j of the byte, mapped from 0..3 to -1..2 */
+  }
+  *f++ = ((small)(*s & 3)) - 1; /* the final byte carries a single element in its low two bits */
+}
 
-  Hash_prefix(x,3,r,Inputs_bytes);
-  for (i = 0;i < Hash_bytes;++i) x[Hash_bytes+i] = cache[i];
-#else
-  unsigned char x[Inputs_bytes+Hash_bytes];
+static void Rq_encode(unsigned char *s, const Fq *r) { /* passes the p coefficients of r, offset by q12, to Encode() with modulus q for every position */
+  uint16_t R[p], M[p];
   int i;
-
-  for (i = 0;i < Inputs_bytes;++i) x[i] = r[i];
-  for (i = 0;i < Hash_bytes;++i) x[Inputs_bytes+i] = cache[i];
-#endif
-  Hash_prefix(h,2,x,sizeof x);
+  for (i = 0; i < p; ++i) R[i] = r[i] + q12; /* q12 is defined elsewhere; presumably (q-1)/2, shifting values to be non-negative -- confirm */
+  for (i = 0; i < p; ++i) M[i] = q;
+  Encode(s, R, M, p);
 }
 
-/* ----- session-key hash */
-
-/* k = HashSession(b,y,z) */
-static void HashSession(unsigned char *k,int b,const unsigned char *y,const unsigned char *z)
-{
-#ifndef LPR
-  unsigned char x[Hash_bytes+Ciphertexts_bytes+Confirm_bytes];
+static void Rq_decode(Fq *r, const unsigned char *s) { /* mirror of Rq_encode: Decode() with modulus q everywhere, then subtract q12 */
+  uint16_t R[p], M[p];
   int i;
+  for (i = 0; i < p; ++i) M[i] = q;
+  Decode(R, s, M, p);
+  for (i = 0; i < p; ++i) r[i] = ((Fq)R[i]) - q12; /* removes the q12 offset that Rq_encode adds */
+}
 
-  Hash_prefix(x,3,y,Inputs_bytes);
-  for (i = 0;i < Ciphertexts_bytes+Confirm_bytes;++i) x[Hash_bytes+i] = z[i];
-#else
-  unsigned char x[Inputs_bytes+Ciphertexts_bytes+Confirm_bytes];
+static void Rounded_encode(unsigned char *s, const Fq *r) { /* like Rq_encode, but each value is scaled down by about 3 and the modulus is (q + 2) / 3 */
+  uint16_t R[p], M[p];
   int i;
-
-  for (i = 0;i < Inputs_bytes;++i) x[i] = y[i];
-  for (i = 0;i < Ciphertexts_bytes+Confirm_bytes;++i) x[Inputs_bytes+i] = z[i];
-#endif
-  Hash_prefix(k,b,x,sizeof x);
+  for (i = 0; i < p; ++i) R[i] = ((r[i] + q12) * 10923) >> 15; /* 10923 is about 2^15 / 3, so this approximates (r[i] + q12) / 3 without a division */
+  for (i = 0; i < p; ++i) M[i] = (q + 2) / 3;
+  Encode(s, R, M, p);
 }
 
-/* ----- Streamlined NTRU Prime and NTRU LPRime */
-
-/* pk,sk = KEM_KeyGen() */
-static void KEM_KeyGen(unsigned char *pk,unsigned char *sk)
-{
+static void Rounded_decode(Fq *r, const unsigned char *s) { /* mirror of Rounded_encode: Decode() with modulus (q + 2) / 3, then r[i] = 3 * R[i] - q12 */
+  uint16_t R[p], M[p];
   int i;
-
-  ZKeyGen(pk,sk); sk += SecretKeys_bytes;
-  for (i = 0;i < PublicKeys_bytes;++i) *sk++ = pk[i];
-  randombytes(sk,Inputs_bytes); sk += Inputs_bytes;
-  Hash_prefix(sk,4,pk,PublicKeys_bytes);
+  for (i = 0; i < p; ++i) M[i] = (q + 2) / 3;
+  Decode(R, s, M, p);
+  for (i = 0; i < p; ++i) r[i] = R[i] * 3 - q12; /* undo the divide-by-3 scaling and the q12 offset */
 }
 
-/* c,r_enc = Hide(r,pk,cache); cache is Hash4(pk) */
-static void Hide(unsigned char *c,unsigned char *r_enc,const Inputs r,const unsigned char *pk,const unsigned char *cache)
-{
-  Inputs_encode(r_enc,r);
-  ZEncrypt(c,r,pk); c += Ciphertexts_bytes;
-  HashConfirm(c,r_enc,pk,cache);
+static void ZKeyGen(unsigned char *pk, unsigned char *sk) { /* runs KeyGen and serialises: pk = Rq_encode(h), sk = Small_encode(f) followed by Small_encode(v) */
+  Fq h[p];
+  small f[p], v[p];
+  KeyGen(h, f, v); /* v receives KeyGen's ginv output */
+  Rq_encode(pk, h);
+  Small_encode(sk, f);
+  Small_encode(sk + Small_bytes, v); /* second half of sk, at offset Small_bytes */
 }
 
-/* c,k = Encap(pk) */
-static void Encap(unsigned char *c,unsigned char *k,const unsigned char *pk)
-{
-  Inputs r;
-  unsigned char r_enc[Inputs_bytes];
-  unsigned char cache[Hash_bytes];
-
-  Hash_prefix(cache,4,pk,PublicKeys_bytes);
-  Inputs_random(r);
-  Hide(c,r_enc,r,pk,cache);
-  HashSession(k,1,r_enc,c);
+static void ZEncrypt(unsigned char *C, const Inputs r, const unsigned char *pk) { /* C = Rounded_encode(Encrypt(r, Rq_decode(pk))) */
+  Fq h[p], c[p];
+  Rq_decode(h, pk);
+  Encrypt(c, r, h);
+  Rounded_encode(C, c);
 }
 
-/* 0 if matching ciphertext+confirm, else -1 */
-static int Ciphertexts_diff_mask(const unsigned char *c,const unsigned char *c2)
-{
-  uint16 differentbits = 0;
-  int len = Ciphertexts_bytes+Confirm_bytes;
-
-  while (len-- > 0) differentbits |= (*c++)^(*c2++);
-  return (1&((differentbits-1)>>8))-1;
+static void ZDecrypt(Inputs r, const unsigned char *C, const unsigned char *sk) { /* decodes f and v from sk and c from C, then r = Decrypt(c, f, v) */
+  small f[p], v[p];
+  Fq c[p];
+  Small_decode(f, sk);
+  Small_decode(v, sk + Small_bytes); /* same two-part layout that ZKeyGen writes */
+  Rounded_decode(c, C);
+  Decrypt(r, c, f, v);
 }
 
-/* k = Decap(c,sk) */
-static void Decap(unsigned char *k,const unsigned char *c,const unsigned char *sk)
-{
-  const unsigned char *pk = sk + SecretKeys_bytes;
-  const unsigned char *rho = pk + PublicKeys_bytes;
-  const unsigned char *cache = rho + Inputs_bytes;
-  Inputs r;
-  unsigned char r_enc[Inputs_bytes];
-  unsigned char cnew[Ciphertexts_bytes+Confirm_bytes];
-  int mask;
+static void HashConfirm(unsigned char *h, const unsigned char *r, const unsigned char *cache) { /* h = Hash_prefix(2, Hash_prefix(3, r) followed by cache) */
+  unsigned char x[Hash_bytes * 2];
   int i;
+  Hash_prefix(x, 3, r, Small_bytes); /* Hash_prefix writes 32 bytes; this fills the first half of x if Hash_bytes is 32 (defined elsewhere) */
+  for (i = 0; i < Hash_bytes; ++i) x[Hash_bytes + i] = cache[i]; /* second half: Hash_bytes bytes copied from cache */
+  Hash_prefix(h, 2, x, sizeof x);
+}
 
-  ZDecrypt(r,c,sk);
-  Hide(cnew,r_enc,r,pk,cache);
-  mask = Ciphertexts_diff_mask(c,cnew);
-  for (i = 0;i < Inputs_bytes;++i) r_enc[i] ^= mask&(r_enc[i]^rho[i]);
-  HashSession(k,1+mask,r_enc,c);
+static void HashSession(unsigned char *k, int b, const unsigned char *y, const unsigned char *z) { /* k = Hash_prefix(b, Hash_prefix(3, y) followed by the CIPHERTEXTBYTES bytes at z) */
+  unsigned char x[Hash_bytes + crypto_kem_sntrup761_CIPHERTEXTBYTES];
+  int i;
+  Hash_prefix(x, 3, y, Small_bytes); /* y is read as Small_bytes bytes */
+  for (i = 0; i < crypto_kem_sntrup761_CIPHERTEXTBYTES; ++i) x[Hash_bytes + i] = z[i];
+  Hash_prefix(k, b, x, sizeof x); /* b is the prefix byte; callers in this file pass 1 or 1 + mask */
 }
 
-/* ----- crypto_kem API */
+int crypto_kem_sntrup761_keypair(unsigned char *pk, unsigned char *sk) { /* fills pk and sk; always returns 0 */
+  int i;
+  ZKeyGen(pk, sk); /* writes pk and the start of sk */
+  sk += SecretKeys_bytes; /* skip past what ZKeyGen wrote (SecretKeys_bytes is defined elsewhere) */
+  for (i = 0; i < crypto_kem_sntrup761_PUBLICKEYBYTES; ++i) *sk++ = pk[i]; /* a copy of pk is stored inside sk */
+  randombytes(sk, Small_bytes); /* Small_bytes random bytes; crypto_kem_sntrup761_dec reads this region as rho */
+  Hash_prefix(sk + Small_bytes, 4, pk, crypto_kem_sntrup761_PUBLICKEYBYTES); /* hash of pk with prefix 4; crypto_kem_sntrup761_dec reads it as cache */
+  return 0;
+}
 
+static void Hide(unsigned char *c, unsigned char *r_enc, const Inputs r, const unsigned char *pk, const unsigned char *cache) { /* outputs r_enc = Small_encode(r) and c = ZEncrypt(r, pk) with a HashConfirm value in its last Confirm_bytes */
+  Small_encode(r_enc, r);
+  ZEncrypt(c, r, pk);
+  HashConfirm(c + crypto_kem_sntrup761_CIPHERTEXTBYTES - Confirm_bytes, r_enc, cache); /* confirmation hash occupies the tail of the CIPHERTEXTBYTES buffer */
+}
 
-int crypto_kem_sntrup761_keypair(unsigned char *pk,unsigned char *sk)
-{
-  KEM_KeyGen(pk,sk);
+int crypto_kem_sntrup761_enc(unsigned char *c, unsigned char *k, const unsigned char *pk) { /* writes ciphertext c and session key k for pk; always returns 0 */
+  Inputs r;
+  unsigned char r_enc[Small_bytes], cache[Hash_bytes];
+  Hash_prefix(cache, 4, pk, crypto_kem_sntrup761_PUBLICKEYBYTES); /* same prefix-4 hash of pk that keypair stores at the end of sk */
+  Short_random(r);
+  Hide(c, r_enc, r, pk, cache);
+  HashSession(k, 1, r_enc, c); /* prefix byte 1 */
   return 0;
 }
 
-int crypto_kem_sntrup761_enc(unsigned char *c,unsigned char *k,const unsigned char *pk)
-{
-  Encap(c,k,pk);
-  return 0;
+static int Ciphertexts_diff_mask(const unsigned char *c, const unsigned char *c2) { /* compares CIPHERTEXTBYTES bytes without an early exit; 0 if all match, else -1 (given the bitmod_01 assumption below) */
+  uint16_t differentbits = 0;
+  int len = crypto_kem_sntrup761_CIPHERTEXTBYTES;
+  while (len-- > 0) differentbits |= (*c++) ^ (*c2++); /* OR of all byte differences; stays 0 only if every byte matches */
+  return (crypto_int64_bitmod_01((differentbits - 1),8)) - 1; /* assumes bitmod_01(x, 8) yields bit 8 of x, as in the replaced (1&((differentbits-1)>>8))-1 */
 }
 
-int crypto_kem_sntrup761_dec(unsigned char *k,const unsigned char *c,const unsigned char *sk)
-{
-  Decap(k,c,sk);
+int crypto_kem_sntrup761_dec(unsigned char *k, const unsigned char *c, const unsigned char *sk) { /* writes session key k for ciphertext c; always returns 0, even when the re-encryption check fails */
+  const unsigned char *pk = sk + SecretKeys_bytes; /* sk layout, as written by keypair: ZKeyGen part, pk, rho, cache */
+  const unsigned char *rho = pk + crypto_kem_sntrup761_PUBLICKEYBYTES;
+  const unsigned char *cache = rho + Small_bytes;
+  Inputs r;
+  unsigned char r_enc[Small_bytes], cnew[crypto_kem_sntrup761_CIPHERTEXTBYTES];
+  int mask, i;
+  ZDecrypt(r, c, sk);
+  Hide(cnew, r_enc, r, pk, cache); /* re-encrypt the decrypted r for comparison with c */
+  mask = Ciphertexts_diff_mask(c, cnew);
+  for (i = 0; i < Small_bytes; ++i) r_enc[i] ^= mask & (r_enc[i] ^ rho[i]); /* mask == -1 replaces r_enc with rho; mask == 0 leaves it; no branch */
+  HashSession(k, 1 + mask, r_enc, c); /* prefix byte is 1 when mask is 0 and 0 when mask is -1 */
   return 0;
 }
 
diff --git a/usr.bin/ssh/sntrup761.sh b/usr.bin/ssh/sntrup761.sh
index db4e9aed08a..92c803bb1a4 100644
--- a/usr.bin/ssh/sntrup761.sh
+++ b/usr.bin/ssh/sntrup761.sh
@@ -1,25 +1,18 @@
 #!/bin/sh
-#       $OpenBSD: sntrup761.sh,v 1.7 2023/01/11 02:13:52 djm Exp $
+#       $OpenBSD: sntrup761.sh,v 1.8 2024/09/15 02:20:51 djm Exp $
 #       Placed in the Public Domain.
 #
-AUTHOR="supercop-20201130/crypto_kem/sntrup761/ref/implementors"
-FILES="
-	supercop-20201130/crypto_sort/int32/portable4/int32_minmax.inc
-	supercop-20201130/crypto_sort/int32/portable4/sort.c
-	supercop-20201130/crypto_sort/uint32/useint32/sort.c
-	supercop-20201130/crypto_kem/sntrup761/ref/uint32.c
-	supercop-20201130/crypto_kem/sntrup761/ref/int32.c
-	supercop-20201130/crypto_kem/sntrup761/ref/paramsmenu.h
-	supercop-20201130/crypto_kem/sntrup761/ref/params.h
-	supercop-20201130/crypto_kem/sntrup761/ref/Decode.h
-	supercop-20201130/crypto_kem/sntrup761/ref/Decode.c
-	supercop-20201130/crypto_kem/sntrup761/ref/Encode.h
-	supercop-20201130/crypto_kem/sntrup761/ref/Encode.c
-	supercop-20201130/crypto_kem/sntrup761/ref/kem.c
+AUTHOR="supercop-20240808/crypto_kem/sntrup761/ref/implementors"  # still the ref/ implementors file, although kem.c below now comes from compact/
+FILES=" supercop-20240808/cryptoint/crypto_int16.h
+	supercop-20240808/cryptoint/crypto_int32.h
+	supercop-20240808/cryptoint/crypto_int64.h
+	supercop-20240808/crypto_sort/int32/portable4/sort.c
+	supercop-20240808/crypto_sort/uint32/useint32/sort.c
+	supercop-20240808/crypto_kem/sntrup761/compact/kem.c
 "
 ###
 
-set -e
+set -euo pipefail  # also abort on unset variables and failed pipeline stages; NOTE(review): pipefail is not supported by every /bin/sh -- confirm for the shells this runs under
 cd $1
 echo -n '/*  $'
 echo 'OpenBSD: $ */'
@@ -32,12 +25,19 @@ echo
 echo '#include <string.h>'
 echo '#include "crypto_api.h"'
 echo
+echo '#define crypto_declassify(x, y) do {} while (0)'  # emitted into the generated C file: crypto_declassify() becomes an empty statement there
+echo
 # Map the types used in this code to the ones in crypto_api.h.  We use #define
 # instead of typedef since some systems have existing intXX types and do not
 # permit multiple typedefs even if they do not conflict.
 for t in int8 uint8 int16 uint16 int32 uint32 int64 uint64; do
 	echo "#define $t crypto_${t}"
 done
+
+for x in 16 32 64 ; do  # emit one extern declaration per integer width; the definitions are added to kexsntrup761x25519.c by this same change
+	echo "extern volatile crypto_int$x crypto_int${x}_optblocker;"
+done
+
 echo
 for i in $FILES; do
 	echo "/* from $i */"
@@ -57,14 +57,27 @@ for i in $FILES; do
 	    -e 's/[	 ]*$//' \
 	    $i | \
 	case "$i" in
-	# Use int64_t for intermediate values in int32_MINMAX to prevent signed
-	# 32-bit integer overflow when called by crypto_sort_uint32.
-	*/int32_minmax.inc)
-	    sed -e "s/int32 ab = b ^ a/int64_t ab = (int64_t)b ^ (int64_t)a/" \
-	        -e "s/int32 c = b - a/int64_t c = (int64_t)b - (int64_t)a/"
+	*/cryptoint/crypto_int16.h)  # drop 'static' from crypto_int16_store and crypto_int16_minmax, and blank out the header's own optblocker #define
+	    sed -e "s/static void crypto_int16_store/void crypto_int16_store/" \
+		-e "s/^[#]define crypto_int16_optblocker.*//" \
+	        -e "s/static void crypto_int16_minmax/void crypto_int16_minmax/"
+	    ;;
+	*/cryptoint/crypto_int32.h)  # same three substitutions for the 32-bit header
+	    sed -e "s/static void crypto_int32_store/void crypto_int32_store/" \
+		-e "s/^[#]define crypto_int32_optblocker.*//" \
+	        -e "s/static void crypto_int32_minmax/void crypto_int32_minmax/"
+	    ;;
+	*/cryptoint/crypto_int64.h)  # same three substitutions for the 64-bit header
+	    sed -e "s/static void crypto_int64_store/void crypto_int64_store/" \
+		-e "s/^[#]define crypto_int64_optblocker.*//" \
+	        -e "s/static void crypto_int64_minmax/void crypto_int64_minmax/"
 	    ;;
 	*/int32/portable4/sort.c)
-	    sed -e "s/void crypto_sort/void crypto_sort_int32/g"
+	    sed -e "s/void crypto_sort[(]/void crypto_sort_int32(/g"  # [(] limits the rename to 'void crypto_sort(' itself, not longer names that start with crypto_sort
+	    ;;
+	*/int32/portable5/sort.c)  # NOTE(review): no portable5 path is listed in FILES in this version, so this arm looks unused -- confirm
+	    sed -e "s/crypto_sort_smallindices/crypto_sort_int32_smallindices/"\
+	        -e "s/void crypto_sort[(]/void crypto_sort_int32(/g"
 	    ;;
 	*/uint32/useint32/sort.c)
 	    sed -e "s/void crypto_sort/void crypto_sort_uint32/g"
-- 
2.20.1