montana/iOS/liboqs/scripts/copy_from_upstream/patches/pqclean-kyber-armneon-768-1024-fixes.patch

284 lines
9.8 KiB
Diff
Raw Permalink Normal View History

diff --git a/crypto_kem/kyber1024/aarch64/indcpa.c b/crypto_kem/kyber1024/aarch64/indcpa.c
index 6b83943e..43f489f0 100644
--- a/crypto_kem/kyber1024/aarch64/indcpa.c
+++ b/crypto_kem/kyber1024/aarch64/indcpa.c
@@ -160,39 +160,44 @@ static void unpack_ciphertext(int16_t b[KYBER_K][KYBER_N], int16_t *v, const uin
**************************************************/
#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
// Not static for benchmarking
-void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed) {
- unsigned int ctr0, ctr1, k;
- unsigned int buflen, off;
- uint8_t buf0[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2],
- buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
- neon_xof_state state;
-
- for (unsigned int i = 0; i < KYBER_K; i++) {
- if (transposed) {
- neon_xof_absorb(&state, seed, i, i, 0, 1);
- } else {
- neon_xof_absorb(&state, seed, 0, 1, i, i);
+void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed)
+{
+ unsigned int ctr0, ctr1, k;
+ unsigned int buflen, off;
+ uint8_t buf0[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2],
+ buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
+ neon_xof_state state;
+
+ for (unsigned int i = 0; i < KYBER_K; i++)
+ {
+ for (unsigned int j = 0; j < KYBER_K; j += 2)
+ {
+ if (transposed)
+ neon_xof_absorb(&state, seed, i, i, j, j + 1);
+ else
+ neon_xof_absorb(&state, seed, j, j + 1, i, i);
+
+ neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state);
+ buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
+ ctr0 = neon_rej_uniform(&(a[i][j][0]), buf0);
+ ctr1 = neon_rej_uniform(&(a[i][j + 1][0]), buf1);
+
+ while (ctr0 < KYBER_N || ctr1 < KYBER_N)
+ {
+ off = buflen % 3;
+ for (k = 0; k < off; k++)
+ {
+ buf0[k] = buf0[buflen - off + k];
+ buf1[k] = buf1[buflen - off + k];
}
+ neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state);
- neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state);
-
- buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
-
- ctr0 = neon_rej_uniform(&(a[i][0][0]), buf0);
- ctr1 = neon_rej_uniform(&(a[i][1][0]), buf1);
- while (ctr0 < KYBER_N || ctr1 < KYBER_N) {
- off = buflen % 3;
- for (k = 0; k < off; k++) {
- buf0[k] = buf0[buflen - off + k];
- buf1[k] = buf1[buflen - off + k];
- }
- neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state);
-
- buflen = off + XOF_BLOCKBYTES;
- ctr0 += rej_uniform(&(a[i][0][0]) + ctr0, KYBER_N - ctr0, buf0, buflen);
- ctr1 += rej_uniform(&(a[i][1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen);
- }
+ buflen = off + XOF_BLOCKBYTES;
+ ctr0 += rej_uniform(&(a[i][j][0]) + ctr0, KYBER_N - ctr0, buf0, buflen);
+ ctr1 += rej_uniform(&(a[i][j + 1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen);
+ }
}
+ }
}
/*************************************************
@@ -224,7 +229,9 @@ void indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
gen_a(a, publicseed);
neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1);
- neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 2, 3);
+ neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(skpv[3][0]), noiseseed, 2, 3);
+ neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 4, 5);
+ neon_poly_getnoise_eta1_2x(&(e[2][0]), &(e[3][0]), noiseseed, 6, 7);
neon_polyvec_ntt(skpv);
neon_polyvec_ntt(e);
@@ -280,10 +287,11 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
poly_frommsg(k, m);
gen_at(at, seed);
- // ETA1 != ETA2 (3 != 2)
- neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1);
- neon_poly_getnoise_eta2_2x(&(ep[0][0]), &(ep[1][0]), coins, 2, 3);
- neon_poly_getnoise_eta2(&(epp[0]), coins, 4);
+ neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1);
+ neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(sp[3][0]), coins, 2, 3);
+ neon_poly_getnoise_eta1_2x(&(ep[0][0]), &(ep[1][0]), coins, 4, 5);
+ neon_poly_getnoise_eta1_2x(&(ep[2][0]), &(ep[3][0]), coins, 6, 7);
+ neon_poly_getnoise_eta2(&(epp[0]), coins, 8);
neon_polyvec_ntt(sp);
diff --git a/crypto_kem/kyber768/aarch64/indcpa.c b/crypto_kem/kyber768/aarch64/indcpa.c
index 02448809..ff24f150 100644
--- a/crypto_kem/kyber768/aarch64/indcpa.c
+++ b/crypto_kem/kyber768/aarch64/indcpa.c
@@ -160,39 +160,114 @@ static void unpack_ciphertext(int16_t b[KYBER_K][KYBER_N], int16_t *v, const uin
**************************************************/
#define GEN_MATRIX_NBLOCKS ((12*KYBER_N/8*(1 << 12)/KYBER_Q + XOF_BLOCKBYTES)/XOF_BLOCKBYTES)
// Not static for benchmarking
-void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed) {
- unsigned int ctr0, ctr1, k;
- unsigned int buflen, off;
- uint8_t buf0[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2],
- buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
- neon_xof_state state;
-
- for (unsigned int i = 0; i < KYBER_K; i++) {
- if (transposed) {
- neon_xof_absorb(&state, seed, i, i, 0, 1);
- } else {
- neon_xof_absorb(&state, seed, 0, 1, i, i);
- }
-
- neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state);
-
- buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
-
- ctr0 = neon_rej_uniform(&(a[i][0][0]), buf0);
- ctr1 = neon_rej_uniform(&(a[i][1][0]), buf1);
- while (ctr0 < KYBER_N || ctr1 < KYBER_N) {
- off = buflen % 3;
- for (k = 0; k < off; k++) {
- buf0[k] = buf0[buflen - off + k];
- buf1[k] = buf1[buflen - off + k];
- }
- neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state);
-
- buflen = off + XOF_BLOCKBYTES;
- ctr0 += rej_uniform(&(a[i][0][0]) + ctr0, KYBER_N - ctr0, buf0, buflen);
- ctr1 += rej_uniform(&(a[i][1][0]) + ctr1, KYBER_N - ctr1, buf1, buflen);
- }
+void gen_matrix(int16_t a[KYBER_K][KYBER_K][KYBER_N], const uint8_t seed[KYBER_SYMBYTES], int transposed)
+{
+ unsigned int ctr0, ctr1, k;
+ unsigned int buflen, off;
+ uint8_t buf0[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2],
+ buf1[GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES + 2];
+ neon_xof_state state;
+
+ int16_t *s1 = NULL, *s2 = NULL;
+ unsigned int x1, x2, y1, y2;
+ xof_state c_state;
+ shake128_inc_init(&c_state); // separate non-NEON XOF state for the unpaired a[2][2] entry; released at the end of gen_matrix
+
+ for (unsigned int j = 0; j < KYBER_K * KYBER_K - 1; j += 2)
+ {
+ switch (j)
+ {
+ case 0:
+ s1 = &(a[0][0][0]);
+ s2 = &(a[0][1][0]);
+ x1 = 0;
+ y1 = 0;
+ x2 = 0;
+ y2 = 1;
+ break;
+ case 2:
+ s1 = &(a[0][2][0]);
+ s2 = &(a[1][0][0]);
+ x1 = 0;
+ y1 = 2;
+ x2 = 1;
+ y2 = 0;
+ break;
+ case 4:
+ s1 = &(a[1][1][0]);
+ s2 = &(a[1][2][0]);
+ x1 = 1;
+ y1 = 1;
+ x2 = 1;
+ y2 = 2;
+ break;
+ default:
+ s1 = &(a[2][0][0]);
+ s2 = &(a[2][1][0]);
+ x1 = 2;
+ y1 = 0;
+ x2 = 2;
+ y2 = 1;
+ break;
}
+
+ if (transposed)
+ neon_xof_absorb(&state, seed, x1, x2, y1, y2);
+ else
+ neon_xof_absorb(&state, seed, y1, y2, x1, x2);
+
+ neon_xof_squeezeblocks(buf0, buf1, GEN_MATRIX_NBLOCKS, &state);
+
+ buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
+
+ ctr0 = neon_rej_uniform(s1, buf0);
+ ctr1 = neon_rej_uniform(s2, buf1);
+
+ while (ctr0 < KYBER_N || ctr1 < KYBER_N)
+ {
+ off = buflen % 3;
+ for (k = 0; k < off; k++)
+ {
+ buf0[k] = buf0[buflen - off + k];
+ buf1[k] = buf1[buflen - off + k];
+ }
+ neon_xof_squeezeblocks(buf0 + off, buf1 + off, 1, &state);
+
+ buflen = off + XOF_BLOCKBYTES;
+ ctr0 += rej_uniform(s1 + ctr0, KYBER_N - ctr0, buf0, buflen);
+ ctr1 += rej_uniform(s2 + ctr1, KYBER_N - ctr1, buf1, buflen);
+ }
+ }
+
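+ // Final entry a[2][2]: the loop above fills the other eight entries in pairs, so this one is sampled on its own with c_state.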
+ if (transposed){
+ xof_absorb(&c_state, seed, 2, 2);
+ }
+ else{
+ xof_absorb(&c_state, seed, 2, 2);
+ }
+
+ xof_squeezeblocks(buf0, GEN_MATRIX_NBLOCKS, &c_state);
+
+ buflen = GEN_MATRIX_NBLOCKS * XOF_BLOCKBYTES;
+
+ ctr0 = neon_rej_uniform(&(a[2][2][0]), buf0);
+
+ while (ctr0 < KYBER_N)
+ {
+ off = buflen % 3;
+ for (k = 0; k < off; k++)
+ {
+ buf0[k] = buf0[buflen - off + k];
+ }
+ xof_squeezeblocks(buf0 + off, 1, &c_state);
+
+ buflen = off + XOF_BLOCKBYTES;
+ ctr0 += rej_uniform(&(a[2][2][0]) + ctr0, KYBER_N - ctr0, buf0, buflen);
+ }
+
+ shake128_inc_ctx_release(&c_state);
+
}
/*************************************************
@@ -224,7 +299,8 @@ void indcpa_keypair(uint8_t pk[KYBER_INDCPA_PUBLICKEYBYTES],
gen_a(a, publicseed);
neon_poly_getnoise_eta1_2x(&(skpv[0][0]), &(skpv[1][0]), noiseseed, 0, 1);
- neon_poly_getnoise_eta1_2x(&(e[0][0]), &(e[1][0]), noiseseed, 2, 3);
+ neon_poly_getnoise_eta1_2x(&(skpv[2][0]), &(e[0][0]), noiseseed, 2, 3);
+ neon_poly_getnoise_eta1_2x(&(e[1][0]), &(e[2][0]), noiseseed, 4, 5);
neon_polyvec_ntt(skpv);
neon_polyvec_ntt(e);
@@ -280,10 +356,11 @@ void indcpa_enc(uint8_t c[KYBER_INDCPA_BYTES],
poly_frommsg(k, m);
gen_at(at, seed);
- // ETA1 != ETA2 (3 != 2)
- neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1);
- neon_poly_getnoise_eta2_2x(&(ep[0][0]), &(ep[1][0]), coins, 2, 3);
- neon_poly_getnoise_eta2(&(epp[0]), coins, 4);
+ // ETA1 == ETA2 for this parameter set, so neon_poly_getnoise_eta1_2x is also used for ep.
+ neon_poly_getnoise_eta1_2x(&(sp[0][0]), &(sp[1][0]), coins, 0, 1);
+ neon_poly_getnoise_eta1_2x(&(sp[2][0]), &(ep[0][0]), coins, 2, 3);
+ neon_poly_getnoise_eta1_2x(&(ep[1][0]), &(ep[2][0]), coins, 4, 5);
+ neon_poly_getnoise_eta2(&(epp[0]), coins, 6);
neon_polyvec_ntt(sp);