[TEST] Reduce `crypto_sign_verify_internal` stack usage
This PR reduces the stack usage of crypto_sign_verify_internal (on top of #743) using simple restructuring and buffer sharing.
CBMC proofs are not yet adjusted.
Runtime stack usage for verify, according to tests/stack:
| Level | main | branch | Reduction |
|------------|----------:|----------:|----------:|
| ML-DSA-44 | 43,872 B | 18,288 B | -58% |
| ML-DSA-65 | 69,024 B | 24,960 B | -64% |
| ML-DSA-87 | 108,112 B | 33,328 B | -69% |
I cherry-picked only the verify stack-usage change ("Reuse t1/w1 polyveck") to get a sense of how CBMC copes with this. Sadly, not well: the proof does not terminate, even if significant parts are commented out. Specifically, this still works:
/*
 * ML-DSA signature verification, restructured to reduce stack usage:
 * the t1 and w1 polyvecks are overlaid in a union (t1w1), since t1 is
 * dead once w1 is computed (see the matrix-vector section below).
 *
 * NOTE(review): the hint application (mld_polyveck_use_hint), the w1
 * packing, the random-oracle call recomputing the challenge, and the
 * c/c2 comparison loop are all commented out below for the CBMC
 * experiment. As written, control reaches `res = 0` WITHOUT comparing
 * the recomputed challenge, so this build must not be used for real
 * verification.
 *
 * Parameters:
 *   sig/siglen - candidate signature; siglen must equal CRYPTO_BYTES
 *   m/mlen     - message, or (if externalmu) the precomputed mu
 *   pre/prelen - domain-separation prefix for the hash
 *   pk         - packed public key
 *   externalmu - nonzero if m already holds mu (MLDSA_CRHBYTES bytes)
 *
 * Returns 0 on acceptance (here: on reaching the truncated end of the
 * routine), -1 on malformed/oversized input.
 */
int crypto_sign_verify_internal(const uint8_t *sig, size_t siglen,
const uint8_t *m, size_t mlen,
const uint8_t *pre, size_t prelen,
const uint8_t pk[CRYPTO_PUBLICKEYBYTES],
int externalmu)
{
unsigned int i; /* only used by the (currently commented-out) comparison loop */
int res;
MLD_ALIGN uint8_t buf[MLDSA_K * MLDSA_POLYW1_PACKEDBYTES]; /* packed w1 (use commented out) */
MLD_ALIGN uint8_t rho[MLDSA_SEEDBYTES];
MLD_ALIGN uint8_t mu[MLDSA_CRHBYTES];
MLD_ALIGN uint8_t c[MLDSA_CTILDEBYTES];
MLD_ALIGN uint8_t c2[MLDSA_CTILDEBYTES]; /* recomputed challenge (use commented out) */
mld_poly cp;
mld_polymat mat;
mld_polyvecl z;
mld_polyveck tmp, h;
/* t1 (from the public key) and w1 (reconstructed commitment) are never
 * live at the same time, so overlap them to save one polyveck of stack.
 * The union is used purely for memory sharing, not type punning. */
union
{
mld_polyveck t1;
mld_polyveck w1;
} t1w1;
if (siglen != CRYPTO_BYTES)
{
res = -1;
goto cleanup;
}
mld_unpack_pk(rho, &t1w1.t1, pk);
if (mld_unpack_sig(c, &z, &h, sig))
{
res = -1;
goto cleanup;
}
/* Reject signatures whose response vector z is out of range. */
if (mld_polyvecl_chknorm(&z, MLDSA_GAMMA1 - MLDSA_BETA))
{
res = -1;
goto cleanup;
}
if (!externalmu)
{
/* Compute CRH(H(rho, t1), pre, msg) */
MLD_ALIGN uint8_t hpk[MLDSA_CRHBYTES];
mld_H(hpk, MLDSA_TRBYTES, pk, CRYPTO_PUBLICKEYBYTES, NULL, 0, NULL, 0);
mld_H(mu, MLDSA_CRHBYTES, hpk, MLDSA_TRBYTES, pre, prelen, m, mlen);
/* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
mld_zeroize(hpk, sizeof(hpk));
}
else
{
/* mu has been provided directly */
mld_memcpy(mu, m, MLDSA_CRHBYTES);
}
/* Matrix-vector multiplication; compute Az - c2^dt1 */
mld_poly_challenge(&cp, c);
mld_polyvec_matrix_expand(&mat, rho);
mld_poly_ntt(&cp);
mld_polyveck_shiftl(&t1w1.t1);
mld_polyveck_ntt(&t1w1.t1);
/* Last read of t1: c*t1 is captured in tmp, so t1's storage can be
 * reused as w1 from here on. */
mld_polyveck_pointwise_poly_montgomery(&tmp, &cp, &t1w1.t1);
mld_polyvecl_ntt(&z);
mld_polyvec_matrix_pointwise_montgomery(&t1w1.w1, &mat, &z);
mld_polyveck_sub(&t1w1.w1, &tmp);
mld_polyveck_reduce(&t1w1.w1);
mld_polyveck_invntt_tomont(&t1w1.w1);
/* Reconstruct w1 */
mld_polyveck_caddq(&t1w1.w1);
/* Everything from here to `res = 0` is commented out for the CBMC
 * experiment described in the PR text; with mld_polyveck_use_hint
 * included the proof has not been observed to terminate. */
/* mld_polyveck_use_hint(&tmp, &t1w1.w1, &h); */
/* mld_polyveck_pack_w1(buf, &tmp); */
/* /\* Call random oracle and verify challenge *\/ */
/* mld_H(c2, MLDSA_CTILDEBYTES, mu, MLDSA_CRHBYTES, buf, */
/* MLDSA_K * MLDSA_POLYW1_PACKEDBYTES, NULL, 0); */
/* /\* Constant time: All data in verification is usually considered public. */
/* * However, in our constant-time tests we do not declassify the message and */
/* * context string. */
/* * The following conditional is the only place in verification whose run-time */
/* * depends on the message. As all that can be leaked here is the output of */
/* * a hash call (that should behave like a random oracle), it is safe to */
/* * declassify here even with a secret message. */
/* *\/ */
/* MLD_CT_TESTING_DECLASSIFY(c2, sizeof(c2)); */
/* for (i = 0; i < MLDSA_CTILDEBYTES; ++i) */
/* __loop__( */
/* invariant(i <= MLDSA_CTILDEBYTES) */
/* ) */
/* { */
/* if (c[i] != c2[i]) */
/* { */
/* res = -1; */
/* goto cleanup; */
/* } */
/* } */
res = 0;
cleanup:
/* @[FIPS204, Section 3.6.3] Destruction of intermediate values. */
mld_zeroize(buf, sizeof(buf));
mld_zeroize(rho, sizeof(rho));
mld_zeroize(mu, sizeof(mu));
mld_zeroize(c, sizeof(c));
mld_zeroize(c2, sizeof(c2));
mld_zeroize(&cp, sizeof(cp));
mld_zeroize(&mat, sizeof(mat));
mld_zeroize(&z, sizeof(z));
mld_zeroize(&t1w1, sizeof(t1w1));
mld_zeroize(&tmp, sizeof(tmp));
mld_zeroize(&h, sizeof(h));
return res;
}
in 256 s (locally), whereas after also including the next line
mld_polyveck_use_hint(&tmp, &t1w1.w1, &h);
I have not yet seen the proof terminate.
@tautschnig indicated that if we don't use unions for representation-conversion, but merely for memory-saving, --slice-formula should do the trick, but we already have this enabled here.
@tautschnig any other suggestions / parameters to set? Otherwise, we may need to do further function splitting to get these memory optimizations.