[TEST] Reduce `crypto_sign_verify_internal` stack usage
This PR reduces the stack usage of crypto_sign_verify_internal (on top of #743) using simple restructuring and buffer sharing.
CBMC proofs are not yet adjusted.
Runtime stack usage for verify, according to tests/stack:
| Level | main | branch | Reduction |
|------------|----------:|----------:|----------:|
| ML-DSA-44 | 43,872 B | 18,288 B | -58% |
| ML-DSA-65 | 69,024 B | 24,960 B | -64% |
| ML-DSA-87 | 108,112 B | 33,328 B | -69% |
I cherry-picked only the verify stack-usage change ("Reuse t1/w1 polyveck") to get a sense of how CBMC copes with this. Sadly, not well: the proof does not terminate, even if significant parts are commented out. Specifically, this still works:
/*
 * ML-DSA signature verification, restructured to reduce stack usage:
 * the t1 and w1 polyvecks are overlaid in a union (t1w1), since t1 is
 * dead once w1 is computed (see the matrix-vector section below).
 *
 * NOTE(review): the hint application (mld_polyveck_use_hint), the w1
 * packing, the random-oracle call recomputing the challenge, and the
 * c/c2 comparison loop are all commented out below for the CBMC
 * experiment. As written, control reaches `res = 0` WITHOUT comparing
 * the recomputed challenge, so this build must not be used for real
 * verification.
 *
 * Parameters:
 *   sig/siglen - candidate signature; siglen must equal CRYPTO_BYTES
 *   m/mlen     - message, or (if externalmu) the precomputed mu
 *   pre/prelen - domain-separation prefix for the hash
 *   pk         - packed public key
 *   externalmu - nonzero if m already holds mu (MLDSA_CRHBYTES bytes)
 *
 * Returns 0 on acceptance (here: on reaching the truncated end of the
 * routine), -1 on malformed/oversized input.
 */
int crypto_sign_verify_internal(const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen,
const uint8_t *pre, size_t prelen,
const uint8_t pk[CRYPTO_PUBLICKEYBYTES],
int externalmu)
{
unsigned int i; /* only used by the (currently commented-out) comparison loop */
int res;
MLD_ALIGN uint8_t buf[MLDSA_K * MLDSA_POLYW1_PACKEDBYTES]; /* packed w1 (use commented out) */
MLD_ALIGN uint8_t rho[MLDSA_SEEDBYTES];
MLD_ALIGN uint8_t mu[MLDSA_CRHBYTES];
MLD_ALIGN uint8_t c[MLDSA_CTILDEBYTES];
MLD_ALIGN uint8_t c2[MLDSA_CTILDEBYTES]; /* recomputed challenge (use commented out) */
mld_poly cp;
mld_polymat mat;
mld_polyvecl z;
mld_polyveck tmp, h;
/* t1 (from the public key) and w1 (reconstructed commitment) are never
 * live at the same time, so overlap them to save one polyveck of stack.
 * The union is used purely for memory sharing, not type punning. */
union
{
mld_polyveck t1;
mld_polyveck w1;
} t1w1;
if (siglen != CRYPTO_BYTES)
{
res = -1;
goto cleanup;
}
mld_unpack_pk(rho, &t1w1.t1, pk);
if (mld_unpack_sig(c, &z, &h, sig))
{
res = -1;
goto cleanup;
}
/* Reject signatures whose response vector z is out of range. */
if (mld_polyvecl_chknorm(&z, MLDSA_GAMMA1 - MLDSA_BETA))
{
res = -1;
goto cleanup;
}
if (!externalmu)
{
/* Compute CRH(H(rho, t1), pre, msg) */
MLD_ALIGN uint8_t hpk[MLDSA_CRHBYTES];
mld_H(hpk, MLDSA_TRBYTES, pk, CRYPTO_PUBLICKEYBYTES, NULL, 0, NULL, 0);
mld_H(mu, MLDSA_CRHBYTES, hpk, MLDSA_TRBYTES, pre, prelen, m, mlen);
/* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
mld_zeroize(hpk, sizeof(hpk));
}
else
{
/* mu has been provided directly */
mld_memcpy(mu, m, MLDSA_CRHBYTES);
}
/* Matrix-vector multiplication; compute Az - c2^dt1 */
mld_poly_challenge(&cp, c);
mld_polyvec_matrix_expand(&mat, rho);
mld_poly_ntt(&cp);
mld_polyveck_shiftl(&t1w1.t1);
mld_polyveck_ntt(&t1w1.t1);
/* Last read of t1: c*t1 is captured in tmp, so t1's storage can be
 * reused as w1 from here on. */
mld_polyveck_pointwise_poly_montgomery(&tmp, &cp, &t1w1.t1);
mld_polyvecl_ntt(&z);
mld_polyvec_matrix_pointwise_montgomery(&t1w1.w1, &mat, &z);
mld_polyveck_sub(&t1w1.w1, &tmp);
mld_polyveck_reduce(&t1w1.w1);
mld_polyveck_invntt_tomont(&t1w1.w1);
/* Reconstruct w1 */
mld_polyveck_caddq(&t1w1.w1);
/* Everything from here to `res = 0` is commented out for the CBMC
 * experiment described in the PR text; with mld_polyveck_use_hint
 * included the proof has not been observed to terminate. */
/* mld_polyveck_use_hint(&tmp, &t1w1.w1, &h); */
/* mld_polyveck_pack_w1(buf, &tmp); */
/* /\* Call random oracle and verify challenge *\/ */
/* mld_H(c2, MLDSA_CTILDEBYTES, mu, MLDSA_CRHBYTES, buf, */
/* MLDSA_K * MLDSA_POLYW1_PACKEDBYTES, NULL, 0); */
/* /\* Constant time: All data in verification is usually considered public. */
/* * However, in our constant-time tests we do not declassify the message and */
/* * context string. */
/* * The following conditional is the only place in verification whose run-time */
/* * depends on the message. As all that can be leaked here is the output of */
/* * a hash call (that should behave like a random oracle), it is safe to */
/* * declassify here even with a secret message. */
/* *\/ */
/* MLD_CT_TESTING_DECLASSIFY(c2, sizeof(c2)); */
/* for (i = 0; i < MLDSA_CTILDEBYTES; ++i) */
/* __loop__( */
/* invariant(i <= MLDSA_CTILDEBYTES) */
/* ) */
/* { */
/* if (c[i] != c2[i]) */
/* { */
/* res = -1; */
/* goto cleanup; */
/* } */
/* } */
res = 0;
cleanup:
/* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
mld_zeroize(buf, sizeof(buf));
mld_zeroize(rho, sizeof(rho));
mld_zeroize(mu, sizeof(mu));
mld_zeroize(c, sizeof(c));
mld_zeroize(c2, sizeof(c2));
mld_zeroize(&cp, sizeof(cp));
mld_zeroize(&mat, sizeof(mat));
mld_zeroize(&z, sizeof(z));
mld_zeroize(&t1w1, sizeof(t1w1));
mld_zeroize(&tmp, sizeof(tmp));
mld_zeroize(&h, sizeof(h));
return res;
}
in 256 s (locally), whereas after also including the next line
mld_polyveck_use_hint(&tmp, &t1w1.w1, &h);
I have not yet seen the proof terminate.
@tautschnig indicated that if we don't use unions for representation-conversion, but merely for memory-saving, --slice-formula should do the trick, but we already have this enabled here.
@tautschnig any other suggestions / parameters to set? Otherwise, we may need to do further function splitting to get these memory optimizations.